Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.parser; import org.jsoup.helper.Validate; import org.jsoup.nodes.Entities; import java.util.ArrayList; import java.util.List; /** * Readers the input stream into tokens. 
*/ class Tokeniser { static final char replacementChar = '\uFFFD'; // replaces null character private CharacterReader reader; // html input private boolean trackErrors = true; private List<ParseError> errors = new ArrayList<ParseError>(); // errors found while tokenising private TokeniserState state = TokeniserState.Data; // current tokenisation state private Token emitPending; // the token we are about to emit on next read private boolean isEmitPending = false; private StringBuilder charBuffer = new StringBuilder(); // buffers characters to output as one token StringBuilder dataBuffer; // buffers data looking for </script> Token.Tag tagPending; // tag we are building up Token.Doctype doctypePending; // doctype building up Token.Comment commentPending; // comment building up private Token.StartTag lastStartTag; // the last start tag emitted, to test appropriate end tag private boolean selfClosingFlagAcknowledged = true; Tokeniser(CharacterReader reader) { this.reader = reader; } Token read() { if (!selfClosingFlagAcknowledged) { error("Self closing flag not acknowledged"); selfClosingFlagAcknowledged = true; } while (!isEmitPending) state.read(this, reader); // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read: if (charBuffer.length() > 0) { String str = charBuffer.toString(); charBuffer.delete(0, charBuffer.length()); return new Token.Character(str); } else { isEmitPending = false; return emitPending; } } void emit(Token token) { Validate.isFalse(isEmitPending, "There is an unread token pending!"); emitPending = token; isEmitPending = true; if (token.type == Token.TokenType.StartTag) { Token.StartTag startTag = (Token.StartTag) token; lastStartTag = startTag; if (startTag.selfClosing) selfClosingFlagAcknowledged = false; } else if (token.type == Token

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>.TokenType.EndTag) { Token.EndTag endTag = (Token.EndTag) token; if (endTag.attributes.size() > 0) error("Attributes incorrectly present on end tag"); } } void emit(String str) { // buffer strings up until last string token found, to emit only one token for a run of character refs etc. 
// does not set isEmitPending; read checks that charBuffer.append(str); } void emit(char c) { charBuffer.append(c); } TokeniserState getState() { return state; } void transition(TokeniserState state) { this.state = state; } void advanceTransition(TokeniserState state) { reader.advance(); this.state = state; } void acknowledgeSelfClosingFlag() { selfClosingFlagAcknowledged = true; } Character consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { if (reader.isEmpty()) return null; if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) return null; if (reader.matchesAny('\t', '\n', '\f', '<', '&')) return null; reader.mark(); if (reader.matchConsume("#")) { // numbered boolean isHexMode = reader.matchConsumeIgnoreCase("X"); String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); if (numRef.length() == 0) { // didn't match anything characterReferenceError(); reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError(); // missing semi int charval = -1; try { int base = isHexMode ? 16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException e) { } // skip if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { characterReferenceError(); return replacementChar; } else { // todo: implement number replacement table // todo: check for extra illegal unicode points as parse errors return (char) charval; } } else { // named // get as many letters as possible, and

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> look for matching entities. 
unconsume backwards till a match is found String nameRef = reader.consumeLetterSequence(); boolean looksLegit = reader.matches(';'); boolean found = false; while (nameRef.length() > 0 && !found) { if (Entities.isNamedEntity(nameRef)) found = true; else { nameRef = nameRef.substring(0, nameRef.length()-1); reader.unconsume(); } } if (!found) { if (looksLegit) // named with semicolon characterReferenceError(); reader.rewindToMark(); return null; } if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matches('='))) { // don't want that to match reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError(); // missing semi return Entities.getCharacterByName(nameRef); } } Token.Tag createTagPending(boolean start) { tagPending = start ? new Token.StartTag() : new Token.EndTag(); return tagPending; } void emitTagPending() { tagPending.finaliseTag(); emit(tagPending); } errors.add(new ParseError("Unexpectedly reached end of file (EOF)", state, reader.pos())); } private void characterReferenceError() { if (trackErrors) errors.add(new ParseError("Invalid character reference", reader.pos())); } private void error(String errorMsg) { if (trackErrors) errors.add(new ParseError(errorMsg, reader.pos())); } boolean currentNodeInHtmlNS() { // todo: implememnt namespaces correctly return true; // Element currentNode = currentNode(); // return currentNode != null && currentNode.namespace().equals("HTML"); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.parser; /** * States and transition activations for the Tokeniser. 
*/ enum TokeniserState { Data { // in data state, gather characters until a character reference or tag is found void read(Tokeniser t, CharacterReader r) { switch (r.current()) { case '&': t.advanceTransition(CharacterReferenceInData); break; case '<': t.advanceTransition(TagOpen); break; case nullChar: t.error(this); // NOT replacement character (oddly?) t.emit(r.consume()); break; case eof: t.emit(new Token.EOF()); break; default: String data = r.consumeToAny('&', '<', nullChar); t.emit(data); break; } } }, CharacterReferenceInData { // from & in data void read(Tokeniser t, CharacterReader r) { Character c = t.consumeCharacterReference(null, false); if (c == null) t.emit('&'); else t.emit(c); t.transition(Data); } }, Rcdata { /// handles data in title, textarea etc void read(Tokeniser t, CharacterReader r) { switch (r.current()) { case '&': t.advanceTransition(CharacterReferenceInRcdata); break; case '<': t.advanceTransition(RcdataLessthanSign); break; case nullChar: t.error(this); r.advance(); t.emit(replacementChar); break; case eof: t.emit(new Token.EOF()); break; default: String data = r.consumeToAny('&', '<', nullChar); t.emit(data); break; } } }, CharacterReferenceInRcdata { void read(Tokeniser t, CharacterReader r) { Character c = t.consumeCharacterReference(null, false); if (c == null) t.emit('&'); else t.emit(c); t.transition(Rcdata); } }, Rawtext { void read(Tokeniser t, CharacterReader r) { switch (r.current()) { case '<': t.advanceTransition(RawtextLessthanSign); break; case nullChar: t.

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>error(this); r.advance(); t.emit(replacementChar); break; case eof: t.emit(new Token.EOF()); break; default: String data = r.consumeToAny('<', nullChar); t.emit(data); break; } } }, ScriptData { void read(Tokeniser t, CharacterReader r) { switch (r.current()) { case '<': t.advanceTransition(ScriptDataLessthanSign); break; case 
nullChar: t.error(this); r.advance(); t.emit(replacementChar); break; case eof: t.emit(new Token.EOF()); break; default: String data = r.consumeToAny('<', nullChar); t.emit(data); break; } } }, PLAINTEXT { void read(Tokeniser t, CharacterReader r) { switch (r.current()) { case nullChar: t.error(this); r.advance(); t.emit(replacementChar); break; case eof: t.emit(new Token.EOF()); break; default: String data = r.consumeTo(nullChar); t.emit(data); break; } } }, TagOpen { // from < in data void read(Tokeniser t, CharacterReader r) { switch (r.current()) { case '!': t.advanceTransition(MarkupDeclarationOpen); break; case '/': t.advanceTransition(EndTagOpen); break; case '?': t.advanceTransition(BogusComment); break; default: if (r.matchesLetter()) { t.createTagPending(true); t.transition(TagName); } else { t.error(this); t.emit('<'); // char that got us here t.transition(Data); } break; } } }, EndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.isEmpty()) { t.eofError(this); t.emit("</"); t.transition(Data); } else if (r.matchesLetter()) { t.createTagPending(false); t.transition(TagName); } else if (r.matches('>

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>')) { t.error(this); t.advanceTransition(Data); } else { t.error(this); t.advanceTransition(BogusComment); } } }, TagName { // from < or </ in data, will have start or end tag pending void read(Tokeniser t, CharacterReader r) { // previous TagOpen state did NOT consume, will have a letter char in current String tagName = 
r.consumeToAny('\t', '\n', '\f', ' ', '/', '>', nullChar).toLowerCase(); t.tagPending.appendTagName(tagName); switch (r.consume()) { case '\t': case '\n': case '\f': case ' ': t.transition(BeforeAttributeName); break; case '/': t.transition(SelfClosingStartTag); break; case '>': t.emitTagPending(); t.advanceTransition(RCDATAEndTagName); } else { t.emit("</"); t.transition(Rcdata); } } }, RCDATAEndTagName { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); t.tagPending.appendTagName(name.toLowerCase()); t.dataBuffer.append(name); return; } char c = r.consume(); switch (c) { case '\t': case '\n': case '\f': case ' ': if (t.isAppropriateEndTagToken()) t.transition(BeforeAttributeName); else anythingElse(t, r); break; case '/': if (t.isAppropriateEndTagToken()) t.transition(SelfClosingStartTag); else anythingElse(t, r); break; case '>': if (t.isAppropriateEndTagToken()) { t.emitTagPending(); t.transition(Data); } else anythingElse(t, r); break; default: anythingElse(t, r); } } private void anythingElse(Tokeniser t, CharacterReader r) { t.emit("</" + t.dataBuffer.toString()); t.transition(Rcdata); } }, RawtextLessthanSign { void read(Tokeniser t, CharacterReader r) { if (r

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>.matches('/')) { t.createTempBuffer(); t.advanceTransition(RawtextEndTagOpen); } else { t.emit('<'); t.transition(Rawtext); } } }, RawtextEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.transition(RawtextEndTagName); } else { t.emit("</"); t.transition(Rawtext); } } }, 
RawtextEndTagName { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); t.tagPending.appendTagName(name.toLowerCase()); t.dataBuffer.append(name); return; } char c = r.consume(); boolean handled = true; switch (c) { case '\t': case '\n': case '\f': case ' ': if (t.isAppropriateEndTagToken()) t.transition(BeforeAttributeName); else handled = false; break; case '/': if (t.isAppropriateEndTagToken()) t.transition(SelfClosingStartTag); else handled = false; break; case '>': if (t.isAppropriateEndTagToken()) { t.emitTagPending(); t.transition(Data); } else handled = false; break; } if (!handled) { t.emit("</" + t.dataBuffer.toString()); r.unconsume(); t.transition(Rawtext); } } }, ScriptDataLessthanSign { void read(Tokeniser t, CharacterReader r) { switch (r.consume()) { case '/': t.createTempBuffer(); t.transition(ScriptDataEndTagOpen); break; case '!': t.emit("<!"); t.transition(ScriptDataEscapeStart); break; default: t.emit("<"); r.unconsume(); t.transition(ScriptData); } } }, ScriptDataEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.transition(ScriptDataEndTagName);

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> } else { t.emit("</"); t.transition(ScriptData); } } }, ScriptDataEndTagName { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); t.tagPending.appendTagName(name.toLowerCase()); t.dataBuffer.append(name); return; } char c = r.consume(); boolean handled = true; switch (c) 
{ case '\t': case '\n': case '\f': case ' ': if (t.isAppropriateEndTagToken()) t.transition(BeforeAttributeName); else handled = false; break; case '/': if (t.isAppropriateEndTagToken()) t.transition(SelfClosingStartTag); else handled = false; break; case '>': if (t.isAppropriateEndTagToken()) { t.emitTagPending(); t.transition(Data); } else handled = false; break; } if (!handled) { t.emit("</" + t.dataBuffer.toString()); r.unconsume(); t.transition(ScriptData); } } }, ScriptDataEscapeStart { void read(Tokeniser t, CharacterReader r) { if (r.matches('-')) { t.emit('-'); t.advanceTransition(ScriptDataEscapeStartDash); } else { t.transition(ScriptData); } } }, ScriptDataEscapeStartDash { void read(Tokeniser t, CharacterReader r) { if (r.matches('-')) { t.emit('-'); t.advanceTransition(ScriptDataEscapedDashDash); } else { t.transition(ScriptData); } } }, ScriptDataEscaped { void read(Tokeniser t, CharacterReader r) { if (r.isEmpty()) { t.eofError(this); t.transition(Data); return; } switch (r.current()) { case '-': t.emit('-'); t.advanceTransition(ScriptDataEscapedDash); break; case '<': t.advanceTransition(ScriptDataEscapedLessthanSign); break; case nullChar: t.error(this); r.advance(); t.emit

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>(replacementChar); break; default: String data = r.consumeToAny('-', '<', nullChar); t.emit(data); } } }, ScriptDataEscapedDash { void read(Tokeniser t, CharacterReader r) { if (r.isEmpty()) { t.eofError(this); t.transition(Data); return; } char c = r.consume(); switch (c) { case '-': t.emit(c); 
t.transition(ScriptDataEscapedDashDash); break; case '<': t.transition(ScriptDataEscapedLessthanSign); break; case nullChar: t.error(this); t.emit(replacementChar); t.transition(ScriptDataEscaped); break; default: t.emit(c); t.transition(ScriptDataEscaped); } } }, ScriptDataEscapedDashDash { void read(Tokeniser t, CharacterReader r) { if (r.isEmpty()) { t.eofError(this); t.transition(Data); return; } char c = r.consume(); switch (c) { case '-': t.emit(c); break; case '<': t.transition(ScriptDataEscapedLessthanSign); break; case '>': t.emit(c); t.transition(ScriptData); break; case nullChar: t.error(this); t.emit(replacementChar); t.transition(ScriptDataEscaped); break; default: t.emit(c); t.transition(ScriptDataEscaped); } } }, ScriptDataEscapedLessthanSign { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTempBuffer(); t.dataBuffer.append(Character.toLowerCase(r.current())); t.emit("<" + r.current()); t.advanceTransition(ScriptDataDoubleEscapeStart); } else if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(ScriptDataEscapedEndTagOpen); } else { t.emit('<'); t.transition(ScriptDataEscaped); } } }, ScriptDataEscapedEndTagOpen { void

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(r.current()); t.advanceTransition(ScriptDataEscapedEndTagName); } else { t.emit("</"); t.transition(ScriptDataEscaped); } } }, ScriptDataEscapedEndTagName { 
void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); t.tagPending.appendTagName(name.toLowerCase()); t.dataBuffer.append(name); r.advance(); return; } char c = r.consume(); boolean handled = true; switch (c) { case '\t': case '\n': case '\f': case ' ': if (t.isAppropriateEndTagToken()) t.transition(BeforeAttributeName); else handled = false; break; case '/': if (t.isAppropriateEndTagToken()) t.transition(SelfClosingStartTag); else handled = false; break; case '>': if (t.isAppropriateEndTagToken()) { t.emitTagPending(); t.transition(Data); } else handled = false; break; } if (!handled) { t.emit("</" + t.dataBuffer.toString()); r.unconsume(); t.transition(ScriptDataEscaped); } } }, ScriptDataDoubleEscapeStart { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); t.dataBuffer.append(name.toLowerCase()); t.emit(name); return; } char c = r.consume(); switch (c) { case '\t': case '\n': case '\f': case ' ': case '/': case '>': if (t.dataBuffer.toString().equals("script")) t.transition(ScriptDataDoubleEscaped); else t.transition(ScriptDataEscaped); t.emit(c); break; default: r.unconsume(); t.transition(ScriptDataEscaped);

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> } } }, ScriptDataDoubleEscaped { void read(Tokeniser t, CharacterReader r) { char c = r.current(); switch (c) { case '-': t.emit(c); t.advanceTransition(ScriptDataDoubleEscapedDash); break; case '<': t.emit(c); t.advanceTransition(ScriptDataDoubleEscapedLessthanSign); break; case nullChar: t.error(this); r.advance(); 
t.emit(replacementChar); break; case eof: t.eofError(this); t.transition(Data); break; default: String data = r.consumeToAny('-', '<', nullChar); t.emit(data); } } }, ScriptDataDoubleEscapedDash { void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '-': t.emit(c); t.transition(ScriptDataDoubleEscapedDashDash); break; case '<': t.emit(c); t.transition(ScriptDataDoubleEscapedLessthanSign); break; case nullChar: t.error(this); t.emit(replacementChar); t.transition(ScriptDataDoubleEscaped); break; case eof: t.eofError(this); t.transition(Data); break; default: t.emit(c); t.transition(ScriptDataDoubleEscaped); } } }, ScriptDataDoubleEscapedDashDash { void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '-': t.emit(c); break; case '<': t.emit(c); t.transition(ScriptDataDoubleEscapedLessthanSign); break; case '>': t.emit(c); t.transition(ScriptData); case nullChar: t.error(this); t.emit(replacementChar); t.transition(ScriptDataDoubleEscaped); break; case eof: t.eofError(this); t.transition(Data); break; default: t.emit(c); t.transition(ScriptDataDoubleEscaped); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> }, ScriptDataDoubleEscapedLessthanSign { void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.emit('/'); t.createTempBuffer(); t.advanceTransition(ScriptDataDoubleEscapeEnd); } else { t.transition(ScriptDataDoubleEscaped); } } }, ScriptDataDoubleEscapeEnd { void read(Tokeniser t, CharacterReader r) { if 
(r.matchesLetter()) { String name = r.consumeLetterSequence(); t.dataBuffer.append(name.toLowerCase()); t.emit(name); return; } char c = r.consume(); switch (c) { case '\t': case '\n': case '\f': case ' ': case '/': case '>': if (t.dataBuffer.toString().equals("script")) t.transition(ScriptDataEscaped); else t.transition(ScriptDataDoubleEscaped); t.emit(c); break; default: r.unconsume(); t.transition(ScriptDataDoubleEscaped); } } }, BeforeAttributeName { // from tagname <xxx void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\f': case ' ': break; // ignore whitespace case '/': t.transition(SelfClosingStartTag); break; case '>': t.emitTagPending(); t.transition(Data); break; case nullChar: t.error(this); t.tagPending.newAttribute(); r.unconsume(); t.transition(AttributeName); break; case eof: t.eofError(this); t.transition(Data); break; case '"': case '\'': case '<': case '=': t.error(this); t.tagPending.newAttribute(); t.tagPending.appendAttributeName(c); t.transition(AttributeName); break; default: // A-Z, anything else t.tagPending.newAttribute(); r.unconsume(); t.transition(AttributeName); } } }, AttributeName { // from before attribute name void read(Tokeniser t, CharacterReader r) { String

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> name = r.consumeToAny('\t', '\n', '\f', ' ', '/', '=', '>', nullChar, '"', '\'', '<'); t.tagPending.appendAttributeName(name.toLowerCase()); char c = r.consume(); switch (c) { case '\t': case '\n': case '\f': case ' ': t.transition(AfterAttributeName); break; case '/': t.transition(SelfClosingStartTag); break; case '=': 
t.transition(BeforeAttributeValue); break; case '>': t.emitTagPending(); t.transition(Data); break; case nullChar: t.error(this); t.tagPending.appendAttributeName(replacementChar); break; case eof: t.eofError(this); t.transition(Data); break; case '"': case '\'': case '<': t.error(this); t.tagPending.appendAttributeName(c); // no default, as covered in consumeToAny } } }, AfterAttributeName { void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\f': case ' ': // ignore break; case '/': t.transition(SelfClosingStartTag); break; case '=': t.transition(BeforeAttributeValue); break; case '>': t.emitTagPending(); t.transition(Data); break; case nullChar: t.error(this); t.tagPending.appendAttributeName(replacementChar); t.transition(AttributeName); break; case eof: t.eofError(this); t.transition(Data); break; case '"': case '\'': case '<': t.error(this); t.tagPending.newAttribute(); t.tagPending.appendAttributeName(c); t.transition(AttributeName); break; default: // A-Z, anything else t.tagPending.newAttribute(); r.unconsume(); t.transition(AttributeName); } } }, BeforeAttributeValue { void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>f': case ' ': // ignore break; case '"': t.transition(AttributeValue_doubleQuoted); break; case '&': r.unconsume(); t.transition(AttributeValue_unquoted); break; case '\'': t.transition(AttributeValue_singleQuoted); break; case nullChar: t.error(this); t.tagPending.appendAttributeValue(replacementChar); 
t.transition(AttributeValue_unquoted); break; case eof: t.eofError(this); t.transition(Data); break; case '>': t.error(this); t.emitTagPending(); t.transition(Data); break; case '<': case '=': case '`': t.error(this); t.tagPending.appendAttributeValue(c); t.transition(AttributeValue_unquoted); break; default: r.unconsume(); t.transition(AttributeValue_unquoted); } } }, AttributeValue_doubleQuoted { void read(Tokeniser t, CharacterReader r) { String value = r.consumeToAny('"', '&', nullChar); if (value.length() > 0) t.tagPending.appendAttributeValue(value); char c = r.consume(); switch (c) { case '"': t.transition(AfterAttributeValue_quoted); break; case '&': Character ref = t.consumeCharacterReference('"', true); if (ref != null) t.tagPending.appendAttributeValue(ref); else t.tagPending.appendAttributeValue('&'); break; case nullChar: t.error(this); t.tagPending.appendAttributeValue(replacementChar); break; case eof: t.eofError(this); t.transition(Data); break; // no default, handled in consume to any above } } }, AttributeValue_singleQuoted { void read(Tokeniser t, CharacterReader r) { String value = r.consumeToAny('\'', '&', nullChar); if (value.length() > 0) t.tagPending.appendAttributeValue(value); char c = r.consume(); switch (c) { case '\'': t.transition(AfterAttributeValue_quoted); break; case '&': Character ref = t.consumeCharacterReference('

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>\'', true); if (ref != null) t.tagPending.appendAttributeValue(ref); else t.tagPending.appendAttributeValue('&'); break; case nullChar: t.error(this); t.tagPending.appendAttributeValue(replacementChar); break; case eof: t.eofError(this); t.transition(Data); break; // no default, handled in consume to any above } } }, 
AttributeValue_unquoted { void read(Tokeniser t, CharacterReader r) { String value = r.consumeToAny('\t', '\n', '\f', ' ', '&', '>', nullChar, '"', '\'', '<', '=', '`'); if (value.length() > 0) t.tagPending.appendAttributeValue(value); char c = r.consume(); switch (c) { case '\t': case '\n': case '\f': case ' ': t.transition(BeforeAttributeName); break; case '&': Character ref = t.consumeCharacterReference('>', true); if (ref != null) t.tagPending.appendAttributeValue(ref); else t.tagPending.appendAttributeValue('&'); break; case '>': t.emitTagPending(); t.transition(Data); break; case nullChar: t.error(this); t.tagPending.appendAttributeValue(replacementChar); break; case eof: t.eofError(this); t.transition(Data); break; case '"': case '\'': case '<': case '=': case '`': t.error(this); t.tagPending.appendAttributeValue(c); break; // no default, handled in consume to any above } } }, // CharacterReferenceInAttributeValue state handled inline AfterAttributeValue_quoted { void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\f': case ' ': t.transition(BeforeAttributeName); break; case '/': t.transition(SelfClosingStartTag); break; case '>': t.emitTagPending(); t.transition(Data); break; case eof: t.eofError(this); t.transition(Data); break;

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> default: t.error(this); r.unconsume(); t.transition(BeforeAttributeName); } } }, SelfClosingStartTag { void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '>': t.tagPending.selfClosing = true; t.emitTagPending(); t.transition(Data); break; case eof: t.eofError(this); t.transition(Data); break; 
default: t.error(this); t.transition(BeforeAttributeName); } } }, BogusComment { void read(Tokeniser t, CharacterReader r) { // todo: handle bogus comment starting from eof. when does that trigger? // rewind to capture character that lead us here r.unconsume(); Token.Comment comment = new Token.Comment(); comment.data.append(r.consumeTo('>')); // todo: replace nullChar with replaceChar t.emit(comment); t.advanceTransition(Data); } }, MarkupDeclarationOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchConsume("--")) { t.createCommentPending(); t.transition(CommentStart); } else if (r.matchConsumeIgnoreCase("DOCTYPE")) { t.transition(Doctype); } else if (r.matchConsume("[CDATA[")) { // todo: should actually check current namepspace, and only non-html allows cdata. until namespace // is implemented properly, keep handling as cdata //} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) { t.transition(CdataSection); } else { t.error(this); t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind } } }, CommentStart { void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '-': t.transition(CommentStartDash); break; case nullChar: t.error(this); t.commentPending.data.append(replacementChar); t.transition(Comment); break; case '>': t.error(this); t

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> (r.matchesLetter()) { t.createDoctypePending(); t.transition(DoctypeName); return; } char c = r.consume(); switch (c) { case '\t': case '\n': case '\f': case ' ': break; // ignore whitespace case nullChar: t.error(this); t.doctypePending.name.append(replacementChar); t.transition(DoctypeName); break; case eof: t.eofError(this); 
t.createDoctypePending(); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.createDoctypePending(); t.doctypePending.name.append(c); t.transition(DoctypeName); } } }, DoctypeName { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); t.doctypePending.name.append(name.toLowerCase()); return; } char c = r.consume(); switch (c) { case '>': t.emitDoctypePending(); t.transition(Data); break; case '\t': case '\n': case '\f': case ' ': t.transition(AfterDoctypeName); break; case nullChar: t.error(this); t.doctypePending.name.append(replacementChar); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.doctypePending.name.append(c); } } }, AfterDoctypeName { void read(Tokeniser t, CharacterReader r) { if (r.isEmpty()) { t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); return; } if (r.matches('>')) { t.emitDoctypePending(); t.advanceTransition(Data); } else if (r.matchConsumeIgnoreCase("PUBLIC")) { t.transition(AfterDoctypePublicKeyword); } else

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>': t.transition(AfterDoctypeSystemIdentifier); break; case nullChar: t.error(this); t.doctypePending.systemIdentifier.append(replacementChar); break; case '>': t.error(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; 
t.emitDoctypePending(); t.transition(Data); break; default: t.doctypePending.systemIdentifier.append(c); } } }, AfterDoctypeSystemIdentifier { void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '\t': case '\n': case '\f': case ' ': break; case '>': t.emitDoctypePending(); t.transition(Data); break; case eof: t.eofError(this); t.doctypePending.forceQuirks = true; t.emitDoctypePending(); t.transition(Data); break; default: t.error(this); t.transition(BogusDoctype); // NOT force quirks } } }, BogusDoctype { void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { case '>': t.emitDoctypePending(); t.transition(Data); break; case eof: t.emitDoctypePending(); t.transition(Data); break; default: // ignore char break; } } }, CdataSection { void read(Tokeniser t, CharacterReader r) { String data = r.consumeTo("]]>"); t.emit(data); r.matchConsume("]]>"); t.transition(Data); } }; abstract void read(Tokeniser t, CharacterReader r); private static final char nullChar = '\u0000'; private static final char replacementChar = Tokeniser.replacementChar; private static final String replacementStr = String.valueOf(Tokeniser.replacementChar); private static final char eof = CharacterReader.EOF; }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.parser; /** CharacterReader cosumes tokens off a string. To replace the old TokenQueue. 
*/ class CharacterReader { static final char EOF = (char) -1; private final String input; private final int length; private int pos = 0; private int mark = 0; CharacterReader(String input) { this.input = input; this.length = input.length(); } int pos() { return pos; } boolean isEmpty() { return pos >= length; } char current() { return isEmpty() ? EOF : input.charAt(pos); } char consume() { return isEmpty() ? EOF : input.charAt(pos++); } void unconsume() { pos--; } void advance() { pos++; } void mark() { mark = pos; } void rewindToMark() { pos = mark; } String consumeAsString() { return input.substring(pos, pos++); } String consumeTo(char c) { int offset = input.indexOf(c, pos); if (offset != -1) { String consumed = input.substring(pos, offset); pos += consumed.length(); return consumed; } else { return consumeToEnd(); } } String consumeTo(String seq) { int offset = input.indexOf(seq, pos); if (offset != -1) { String consumed = input.substring(pos, offset); pos += consumed.length(); return consumed; } else { return consumeToEnd(); } } String consumeToAny(char... seq) { int start = pos; OUTER: while (!isEmpty()) { char c = input.charAt(pos); for (char seek : seq) { if (seek == c) break OUTER; } pos++; } return pos > start ? input.substring(start, pos) : ""; } String consumeToEnd() { String data = input.substring(pos, input.length() - 1); pos = input.length(); return data; } String consumeLetterSequence() { int start = pos; while (!isEmpty()) { char c = input.charAt(pos); if ((c >= 'A' && c <= 'Z') || (c >= 'a' &&

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> c <= 'z')) pos++; else break; } return input.substring(start, pos); } String consumeHexSequence() { int start = pos; while (!isEmpty()) { char c = input.charAt(pos); if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) pos++; else break; } return input.substring(start, pos); } String 
consumeDigitSequence() { int start = pos; while (!isEmpty()) { char c = input.charAt(pos); if (c >= '0' && c <= '9') pos++; else break; } return input.substring(start, pos); } boolean matches(char c) { return !isEmpty() && input.charAt(pos) == c; } boolean matches(String seq) { return input.startsWith(seq, pos); } boolean matchesIgnoreCase(String seq) { return input.regionMatches(true, pos, seq, 0, seq.length()); } boolean matchesAny(char... seq) { if (isEmpty()) return false; char c = input.charAt(pos); for (char seek : seq) { if (seek == c) return true; } return false; } boolean matchesLetter() { if (isEmpty()) return false; char c = input.charAt(pos); return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } boolean matchesDigit() { if (isEmpty()) return false; char c = input.charAt(pos); return (c >= '0' && c <= '9'); } boolean matchConsume(String seq) { if (matches(seq)) { pos += seq.length(); return true; } else { return false; } } boolean matchConsumeIgnoreCase(String seq) { if (matchesIgnoreCase(seq)) { pos += seq.length(); return true; } else { return false; } } boolean containsIgnoreCase(String seq) { // used to check presence of </title>, </style>. only finds consistent case. String loScan = seq.toLowerCase(); String hi

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>Scan = seq.toUpperCase(); return (input.indexOf(loScan, pos) > -1) || (input.indexOf(hiScan, pos) > -1); } @Override public String toString() { return input.substring(pos); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.nodes; /** A data node, for contents of style, script tags etc, where contents should not show in text(). @author Jonathan Hedley, jonathan@hedley.net */ public class DataNode extends Node{ private static final String DATA_KEY = "data"; /** Create a new DataNode. 
@param data data contents @param baseUri base URI */ public DataNode(String data, String baseUri) { super(baseUri); attributes.put(DATA_KEY, data); } public String nodeName() { return "#data"; } /** Get the data contents of this node. Will be unescaped and with original new lines, space etc. @return data */ public String getWholeData() { return attributes.get(DATA_KEY); } /** * Set the data contents of this node. * @param data unencoded data * @return this node, for chaining */ public DataNode setWholeData(String data) { attributes.put(DATA_KEY, data); return this; } void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { accum.append(getWholeData()); // data is not escaped in return from data nodes, so " in script, style is plain } void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {} public String toString() { return outerHtml(); } /** Create a new DataNode from HTML encoded data. @param encodedData encoded data @param baseUri bass URI @return new DataNode */ public static DataNode createFromEncoded(String encodedData, String baseUri) { String data = Entities.unescape(encodedData); return new DataNode(data, baseUri); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; import org.jsoup.safety.Cleaner; import org.jsoup.safety.Whitelist; import org.jsoup.helper.DataUtil; import org.jsoup.helper.HttpConnection; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URL; /** The 
core public access point to the jsoup functionality. @author Jonathan Hedley */ public class Jsoup { private Jsoup() {} /** Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML. @param html HTML to parse @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur before the HTML declares a {@code <base href>} tag. @return sane HTML */ public static Document parse(String html, String baseUri) { return Parser.parse(html, baseUri); } /** Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a {@code <base href>} tag. @param html HTML to parse @return sane HTML @see #parse(String, String) */ public static Document parse(String html) { return Parser.parse(html, ""); } /** * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page. * <p> * Use examples: * <ul> * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li> * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post(); * </ul> * @param url URL to connect to. The protocol must be {@code http} or {@code https}. * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute. */ public static Connection connect(String url) { return HttpConnection.connect(url); } /** Parse the contents of a file

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> as HTML. @param in file to load HTML from @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often safe to do). 
@param baseUri The URL where the HTML was retrieved from, to resolve relative links against. @return sane HTML @throws IOException if the file could not be found, or read, or if the charsetName is invalid. */ public static Document parse(File in, String charsetName, String baseUri) throws IOException { return DataUtil.load(in, charsetName, baseUri); } /** Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. @param in file to load HTML from @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often safe to do). @return sane HTML @throws IOException if the file could not be found, or read, or if the charsetName is invalid. @see #parse(File, String, String) */ public static Document parse(File in, String charsetName) throws IOException { return DataUtil.load(in, charsetName, in.getAbsolutePath()); } /** Read an input stream, and parse it to a Document. @param in input stream to read. Make sure to close it after parsing. @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often safe to do). @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. @return sane HTML @throws IOException if the file could not be found, or read, or if the charsetName is invalid. */ public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException { return DataUtil.load(in, charsetName, baseUri); } /** Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. @param bodyHtml body

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> HTML fragment @param baseUri URL to resolve relative URLs against. 
@return sane HTML document @see Document#body() */ public static Document parseBodyFragment(String bodyHtml, String baseUri) { return Parser.parseBodyFragment(bodyHtml, baseUri); } /** Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. @param bodyHtml body HTML fragment @return sane HTML document @see Document#body() */ public static Document parseBodyFragment(String bodyHtml) { return Parser.parseBodyFragment(bodyHtml, ""); } /** Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead. <p> The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}. @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}. @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown. @return The parsed HTML. @throws IOException If the final server response != 200 OK (redirects are followed), or if there's an error reading the response stream. @see #connect(String) */ public static Document parse(URL url, int timeoutMillis) throws IOException { Connection con = HttpConnection.connect(url); con.timeout(timeoutMillis); return con.get(); } /** Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted tags and attributes. @param bodyHtml input untrusted HMTL @param baseUri URL to resolve relative URLs against @param whitelist white-list of permitted HTML elements @return safe HTML @see Cleaner#clean(Document) */ public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) { Document dirty = parseBodyFragment(bodyHtml, baseUri); Cleaner cleaner = new Cleaner(whitelist); Document clean = cleaner.clean(dirty); return clean.body().html(); } /** Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted tags and attributes. 
@param bodyHtml input untrusted HTML @

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>param whitelist white-list of permitted HTML elements @return safe HTML @see Cleaner#clean(Document) */ public static String clean(String bodyHtml, Whitelist whitelist) { return clean(bodyHtml, "", whitelist); } /** Test if the input HTML has only tags and attributes allowed by the Whitelist. Useful for form validation. 
The input HTML should still be run through the cleaner to set up enforced attributes, and to tidy the output. @param bodyHtml HTML to test @param whitelist whitelist to test against @return true if no tags or attributes were removed; false otherwise @see #clean(String, org.jsoup.safety.Whitelist) */ public static boolean isValid(String bodyHtml, Whitelist whitelist) { Document dirty = parseBodyFragment(bodyHtml, ""); Cleaner cleaner = new Cleaner(whitelist); return cleaner.isValid(dirty); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.select; import org.jsoup.helper.StringUtil; import org.jsoup.nodes.Element; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; /** * Base combining (and, or) evaluator. 
*/ abstract class CombiningEvaluator extends Evaluator { final List<Evaluator> evaluators; CombiningEvaluator() { super(); evaluators = new ArrayList<Evaluator>(); } CombiningEvaluator(Collection<Evaluator> evaluators) { this(); this.evaluators.addAll(evaluators); } static final class And extends CombiningEvaluator { And(Collection<Evaluator> evaluators) { super(evaluators); } And(Evaluator... evaluators) { this(Arrays.asList(evaluators)); } @Override public boolean matches(Element root, Element node) { for (Evaluator s : evaluators) { if (!s.matches(root, node)) return false; } return true; } @Override public String toString() { return StringUtil.join(evaluators, " "); } } static final class Or extends CombiningEvaluator { /** * Create a new Or evaluator. The initial evaluators are ANDed together and used as the first clause of the OR. * @param evaluators initial OR clause (these are wrapped into an AND evaluator). */ Or(Collection<Evaluator> evaluators) { super(); if (evaluators.size() > 1) this.evaluators.add(new And(evaluators)); else // 0 or 1 this.evaluators.addAll(evaluators); } public void add(Evaluator e) { evaluators.add(e); } @Override public boolean matches(Element root, Element node) { for (Evaluator s : evaluators) { if (s.matches(root, node)) return true; } return false; } @Override public String toString() { return String.format(":or%s", evaluators); } } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>BeforeHead); return tb.process(t); } }, BeforeHead { boolean process(Token t, TreeBuilder tb) { if (isWhitespace(t)) { return true; } else if (t.isComment()) { tb.insert(t.asComment()); } else if (t.isDoctype()) { tb.error(this); return false; } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { return 
InBody.process(t, tb); // does not transition } else if (t.isStartTag() && t.asStartTag().name().equals("head")) { Element head = tb.insert(t.asStartTag()); tb.setHeadElement(head); tb.transition(InHead); } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br"))) { tb.process(new Token.StartTag("head")); return tb.process(t); } else if (t.isEndTag()) { tb.error(this); return false; } else { tb.process(new Token.StartTag("head")); return tb.process(t); } return true; } }, InHead { boolean process(Token t, TreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); return true; } switch (t.type) { case Comment: tb.insert(t.asComment()); break; case Doctype: tb.error(this); return false; case StartTag: Token.StartTag start = t.asStartTag(); String name = start.name(); if (name.equals("html")) { return InBody.process(t, tb); } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link")) { Element el = tb.insertEmpty(start); // jsoup special: update base as it is seen. todo: flip to current browser behaviour of one shot if (name.equals("base") && el.hasAttr("href")) tb.setBaseUri(el); } else if (name.equals("meta")) {

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> Element meta = tb.insertEmpty(start); // todo: charset switches } else if (name.equals("title")) { handleRcData(start, tb); } else if (StringUtil.in(name, "noframes", "style")) { handleRawtext(start, tb); } else if (name.equals("noscript")) { // else if noscript && scripting flag = true: rawtext (jsoup doesn't run script, to 
handle as noscript) tb.insert(start); tb.transition(InHeadNoscript); } else if (name.equals("script")) { // skips some script rules as won't execute them tb.insert(start); tb.tokeniser.transition(TokeniserState.ScriptData); tb.markInsertionMode(); tb.transition(Text); } else if (name.equals("head")) { tb.error(this); return false; } else { return anythingElse(t, tb); } break; case EndTag: Token.EndTag end = t.asEndTag(); name = end.name(); if (name.equals("head")) { tb.pop(); tb.transition(AfterHead); } else if (StringUtil.in(name, "body", "html", "br")) { return anythingElse(t, tb); } else { tb.error(this); return false; } break; default: return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, TreeBuilder tb) { tb.process(new Token.EndTag("head")); return tb.process(t); } }, InHeadNoscript { boolean process(Token t, TreeBuilder tb) { if (t.isDoctype()) { tb.error(this); } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { return tb.process(t, InBody); } else if (t.isEndTag() && t.asEndTag().name().equals("noscript")) { tb.pop(); tb.transition(InHead); } else if (isWhitespace(t) || t.isComment() || (t.isStartTag() && StringUtil.in(t.asStartTag().name(), "

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>basefont", "bgsound", "link", "meta", "noframes", "style"))) { return tb.process(t, InHead); } else if (t.isEndTag() && t.asEndTag().name().equals("br")) { return anythingElse(t, tb); } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), "head", "noscript")) || t.isEndTag()) { tb.error(this); return false; } else { 
return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, TreeBuilder tb) { tb.error(this); tb.process(new Token.EndTag("noscript")); return tb.process(t); } }, AfterHead { boolean process(Token t, TreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); } else if (t.isComment()) { tb.insert(t.asComment()); } else if (t.isDoctype()) { tb.error(this); } else if (t.isStartTag()) { Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if (name.equals("html")) { return tb.process(t, InBody); } else if (name.equals("body")) { tb.insert(startTag); tb.framesetOk(false); tb.transition(InBody); } else if (name.equals("frameset")) { tb.insert(startTag); tb.transition(InFrameset); } else if (StringUtil.in(name, "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title")) { tb.error(this); Element head = tb.getHeadElement(); tb.push(head); tb.process(t, InHead); tb.removeFromStack(head); } else if (name.equals("head")) { tb.error(this); return false; } else { anythingElse(t, tb); } } else if (t.isEndTag()) { if (StringUtil.in(t.asEndTag().name(),

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> "body", "html")) { anythingElse(t, tb); } else { tb.error(this); return false; } } else { anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, TreeBuilder tb) { tb.process(new Token.StartTag("body")); tb.framesetOk(true); return tb.process(t); } }, InBody { boolean process(Token t, TreeBuilder tb) { switch 
(t.type) { case Character: { Token.Character c = t.asCharacter(); if (c.getData().equals(nullString)) { // todo confirm that check tb.error(this); return false; } else if (isWhitespace(c)) { tb.reconstructFormattingElements(); tb.insert(c); } else { tb.reconstructFormattingElements(); tb.insert(c); tb.framesetOk(false); } break; } case Comment: { tb.insert(t.asComment()); break; } case Doctype: { tb.error(this); return false; } case StartTag: Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if (name.equals("html")) { tb.error(this); // merge attributes onto real html Element html = tb.getStack().getFirst(); for (Attribute attribute : startTag.getAttributes()) { if (!html.hasAttr(attribute.getKey())) html.attributes().put(attribute); } } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "style", "title")) { return tb.process(t, InHead); } else if (name.equals("body")) { tb.error(this); LinkedList<Element> stack = tb.getStack(); if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { // only in fragment case return false; // ignore } else { tb.framesetOk(false); Element body = stack.get(1); for (Attribute attribute : startTag.getAttributes

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>()) { if (!body.hasAttr(attribute.getKey())) body.attributes().put(attribute); } } } else if (name.equals("frameset")) { tb.error(this); LinkedList<Element> stack = tb.getStack(); if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { // only in fragment case return false; // ignore } else if 
(!tb.framesetOk()) { return false; // ignore frameset } else { Element second = stack.get(1); if (second.parent() != null) second.remove(); // pop up to html element while (stack.size() > 1) stack.removeLast(); tb.insert(startTag); tb.transition(InFrameset); } } else if (StringUtil.in(name, "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul")) { if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "h6")) { if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } if (StringUtil.in(tb.currentElement().nodeName(), "h1", "h2", "h3", "h4", "h5", "h6")) { tb.error(this); tb.pop(); } tb.insert(startTag); } else if (StringUtil.in(name, "pre", "listing")) { if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); // todo: ignore LF if next token tb.framesetOk(false); } else if (name.equals("form")) { if (tb.getFormElement() !=

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> null) { tb.error(this); return false; } if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } Element form = tb.insert(startTag); tb.setFormElement(form); } else if (name.equals("li")) { tb.framesetOk(false); LinkedList<Element> stack = tb.getStack(); for (int i = stack.size() - 1; i > 0; i--) { Element el = 
stack.get(i); if (el.nodeName().equals("li")) { tb.process(new Token.EndTag("li")); break; } if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p")) break; } if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); } else if (StringUtil.in(name, "dd", "dt")) { tb.framesetOk(false); LinkedList<Element> stack = tb.getStack(); for (int i = stack.size() - 1; i > 0; i--) { Element el = stack.get(i); if (StringUtil.in(el.nodeName(), "dd", "dt")) { tb.process(new Token.EndTag(el.nodeName())); break; } if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p")) break; } if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); } else if (name.equals("plaintext")) { if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once in, never gets out } else if (name.equals("button")) { if (tb.inButtonScope("button")) { // close and reprocess tb.error(this); tb.process(new Token.EndTag("button")); tb.process(startTag

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>); } else { tb.reconstructFormattingElements(); tb.insert(startTag); tb.framesetOk(false); } } else if (name.equals("a")) { if (tb.getActiveFormattingElement("a") != null) { tb.error(this); tb.process(new Token.EndTag("a")); // still on stack? 
Element remainingA = tb.getFromStack("a"); if (remainingA != null) { tb.removeFromActiveFormattingElements(remainingA); tb.removeFromStack(remainingA); } } tb.reconstructFormattingElements(); Element a = tb.insert(startTag); tb.pushActiveFormattingElements(a); } else if (StringUtil.in(name, "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u")) { tb.reconstructFormattingElements(); Element el = tb.insert(startTag); tb.pushActiveFormattingElements(el); } else if (name.equals("nobr")) { tb.reconstructFormattingElements(); if (tb.inScope("nobr")) { tb.error(this); tb.process(new Token.EndTag("nobr")); tb.reconstructFormattingElements(); } Element el = tb.insert(startTag); tb.pushActiveFormattingElements(el); } else if (StringUtil.in(name, "applet", "marquee", "object")) { tb.reconstructFormattingElements(); tb.insert(startTag); tb.insertMarkerToFormattingElements(); tb.framesetOk(false); } else if (name.equals("table")) { if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks && tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); tb.framesetOk(false); tb.transition(InTable); } else if (StringUtil.in(name, "area", "br", "embed", "img", "keygen", "wbr")) { tb.reconstructFormattingElements(); tb.insertEmpty(startTag); tb.framesetOk(false); } else if (name.equals("

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>input")) { tb.reconstructFormattingElements(); Element el = tb.insertEmpty(startTag); if (!el.attr("type").equalsIgnoreCase("hidden")) tb.framesetOk(false); } else if (StringUtil.in(name, "param", "source", "track")) { tb.insertEmpty(startTag); } else if (name.equals("hr")) { if (tb.inButtonScope("p")) { tb.process(new 
Token.EndTag("p")); } tb.insertEmpty(startTag); tb.framesetOk(false); } else if (name.equals("image")) { // we're not supposed to ask. startTag.name("img"); return tb.process(startTag); } else if (name.equals("isindex")) { // how much do we care about the early 90s? tb.error(this); if (tb.getFormElement() != null) return false; tb.tokeniser.acknowledgeSelfClosingFlag(); tb.process(new Token.StartTag("form")); if (startTag.attributes.hasKey("action")) { Element form = tb.getFormElement(); form.attr("action", startTag.attributes.get("action")); } tb.process(new Token.StartTag("hr")); tb.process(new Token.StartTag("label")); // hope you like english. String prompt = startTag.attributes.hasKey("prompt") ? startTag.attributes.get("prompt") : "This is a searchable index. Enter search keywords: "; tb.process(new Token.Character(prompt)); // input Attributes inputAttribs = new Attributes(); for (Attribute attr : startTag.attributes) { if (!StringUtil.in(attr.getKey(), "name", "action", "prompt")) inputAttribs.put(attr); } inputAttribs.put("name", "isindex"); tb.process(new Token.StartTag("input", inputAttribs)); tb.process(new Token.EndTag("label")); tb.process(new Token.StartTag("hr")); tb.process(new Token.EndTag("form")); } else if (name.equals("textarea")) { tb.insert(startTag); // todo: If the next token is a U+000A LINE FEED

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.) 
tb.tokeniser.transition(TokeniserState.Rcdata); tb.markInsertionMode(); tb.framesetOk(false); tb.transition(Text); } else if (name.equals("xmp")) { if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.reconstructFormattingElements(); tb.framesetOk(false); handleRawtext(startTag, tb); } else if (name.equals("iframe")) { tb.framesetOk(false); handleRawtext(startTag, tb); } else if (name.equals("noembed")) { // also handle noscript if script enabled handleRawtext(startTag, tb); } else if (name.equals("select")) { tb.reconstructFormattingElements(); tb.insert(startTag); tb.framesetOk(false); TreeBuilderState state = tb.state(); if (state.equals(InTable) || state.equals(InCaption) || state.equals(InTableBody) || state.equals(InRow) || state.equals(InCell)) tb.transition(InSelectInTable); else tb.transition(InSelect); } else if (StringUtil.in("optgroup", "option")) { if (tb.currentElement().nodeName().equals("option")) tb.process(new Token.EndTag("option")); tb.reconstructFormattingElements(); tb.insert(startTag); } else if (StringUtil.in("rp", "rt")) { if (tb.inScope("ruby")) { tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals("ruby")) { tb.error(this); tb.popStackToBefore("ruby"); // i.e. close up to but not include name } tb.insert(startTag); } } else if (name.equals("math")) { tb.reconstructFormattingElements(); // todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml) tb.insert(startTag); tb.tokeniser.acknowledgeSelfClosingFlag(); } else if

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> (name.equals("svg")) { tb.reconstructFormattingElements(); // todo: handle A start tag whose tag name is "svg" (xlink, svg) tb.insert(startTag); tb.tokeniser.acknowledgeSelfClosingFlag(); } else if (StringUtil.in(name, "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr")) { tb.error(this); 
return false; } else { tb.reconstructFormattingElements(); tb.insert(startTag); } break; case EndTag: Token.EndTag endTag = t.asEndTag(); name = endTag.name(); if (name.equals("body")) { if (!tb.inScope("body")) { tb.error(this); return false; } else { // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html tb.transition(AfterBody); } } else if (name.equals("html")) { boolean notIgnored = tb.process(new Token.EndTag("body")); if (notIgnored) return tb.process(endTag); } else if (StringUtil.in(name, "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", "section", "summary", "ul")) { // todo: refactor these lookups if (!tb.inScope(name)) { // nothing to close tb.error(this); return false; } else { tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(name); } } else if (name.equals("form")) { Element currentForm = tb.getFormElement(); tb.setFormElement(null); if (currentForm == null || !tb.inScope(name)) { tb.error(this); return false; } else {

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); // remove currentForm from stack. will shift anything under up. 
tb.removeFromStack(currentForm); } } else if (name.equals("p")) { if (!tb.inButtonScope(name)) { tb.error(this); tb.process(new Token.StartTag(name)); // if no p to close, creates an empty <p></p> return tb.process(endTag); } else { tb.generateImpliedEndTags(name); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(name); } } else if (name.equals("li")) { if (!tb.inListItemScope(name)) { tb.error(this); return false; } else { tb.generateImpliedEndTags(name); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(name); } } else if (StringUtil.in(name, "dd", "dt")) { if (!tb.inScope(name)) { tb.error(this); return false; } else { tb.generateImpliedEndTags(name); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(name); } } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "h6")) { if (!tb.inScope(new String[]{"h1", "h2", "h3", "h4", "h5", "h6"})) { tb.error(this); return false; } else { tb.generateImpliedEndTags(name); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose("h1", "h2", "h3", "h4", "h5", "h6"); } } else if (name.equals("sarcasm")) { // *sigh* return anyOtherEndTag(t, tb); } else if (StringUtil.

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> } else { return anyOtherEndTag(t, tb); } break; case EOF: // todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html // stop parsing break; } return true; } boolean anyOtherEndTag(Token t, TreeBuilder tb) { String name = t.asEndTag().name(); LinkedList<Element> stack = tb.getStack(); 
Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element node = it.next(); if (node.nodeName().equals(name)) { tb.generateImpliedEndTags(name); if (!name.equals(tb.currentElement().nodeName())) tb.error(this); tb.popStackToClose(name); break; } else { if (tb.isSpecial(node)) { tb.error(this); return false; } } } return true; } }, Text { // in script, style etc. normally treated as data tags boolean process(Token t, TreeBuilder tb) { if (t.isCharacter()) { tb.insert(t.asCharacter()); } else if (t.isEOF()) { tb.error(this); // if current node is script: already started tb.pop(); tb.transition(tb.originalState()); return tb.process(t); } else if (t.isEndTag()) { // if: An end tag whose tag name is "script" -- scripting nesting level, if evaluating scripts tb.pop(); tb.transition(tb.originalState()); } return true; } }, InTable { boolean process(Token t, TreeBuilder tb) { if (t.isCharacter()) { tb.newPendingTableCharacters(); tb.markInsertionMode(); tb.transition(InTableText); return tb.process(t); } else if (t.isComment()) { tb.insert(t.asComment()); } else if (t.isDoctype()) { tb.error(this); return false; } else if (t.isStartTag()) { Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> (name.equals("caption")) { tb.clearStackToTableContext(); tb.insertMarkerToFormattingElements(); tb.insert(startTag); tb.transition(InCaption); } else if (name.equals("colgroup")) { tb.clearStackToTableContext(); tb.insert(startTag); tb.transition(InColumnGroup); } else if (name.equals("col")) { tb.process(new 
Token.StartTag("colgroup")); return tb.process(t); } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { tb.clearStackToTableContext(); tb.insert(startTag); tb.transition(InTableBody); } else if (StringUtil.in(name, "td", "th", "tr")) { tb.process(new Token.StartTag("tbody")); return tb.process(t); } else if (name.equals("table")) { tb.error(this); boolean processed = tb.process(new Token.EndTag("table")); if (processed) // only ignored if in fragment return tb.process(t); } else if (StringUtil.in(name, "style", "script")) { return tb.process(t, InHead); } else if (name.equals("input")) { if (!startTag.attributes.get("type").equalsIgnoreCase("hidden")) { return anythingElse(t, tb); } else { tb.insertEmpty(startTag); } } else if (name.equals("form")) { tb.error(this); if (tb.getFormElement() != null) return false; else { Element form = tb.insertEmpty(startTag); tb.setFormElement(form); } } else { return anythingElse(t, tb); } } else if (t.isEndTag()) { Token.EndTag endTag = t.asEndTag(); String name = endTag.name(); if (name.equals("table")) { if (!tb.inTableScope(name)) { tb.error(this); return false; } else { tb.popStackToClose("table"); } tb.resetInsertionMode(); } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "tbody",

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> "td", "tfoot", "th", "thead", "tr")) { tb.error(this); return false; } else { return anythingElse(t, tb); } } else if (t.isEOF()) { if (tb.currentElement().nodeName().equals("html")) tb.error(this); return true; // stops parsing } return anythingElse(t, tb); } boolean anythingElse(Token t, TreeBuilder tb) { tb.error(this); 
boolean processed = true; if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { tb.setFosterInserts(true); processed = tb.process(t, InBody); tb.setFosterInserts(false); } else { processed = tb.process(t, InBody); } return processed; } }, InTableText { boolean process(Token t, TreeBuilder tb) { switch (t.type) { case Character: Token.Character c = t.asCharacter(); if (c.getData().equals(nullString)) { tb.error(this); return false; } else { tb.getPendingTableCharacters().add(c); } break; default: if (tb.getPendingTableCharacters().size() > 0) { for (Token.Character character : tb.getPendingTableCharacters()) { if (!isWhitespace(character)) { // InTable anything else section: tb.error(this); if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { tb.setFosterInserts(true); tb.process(character, InBody); tb.setFosterInserts(false); } else { tb.process(character, InBody); } } else tb.insert(character); } tb.newPendingTableCharacters(); } tb.transition(tb.originalState()); return tb.process(t); } return true; } }, InCaption { boolean process(Token t, TreeBuilder tb) { if (t.isEndTag() && t.asEndTag().name().equals("caption")) { Token.EndTag endTag = t.

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>asEndTag(); String name = endTag.name(); if (!tb.inTableScope(name)) { tb.error(this); return false; } else { tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals("caption")) tb.error(this); tb.popStackToClose("caption"); tb.clearFormattingElementsToLastMarker(); tb.transition(InTable); } } else if (( 
t.isStartTag() && StringUtil.in(t.asStartTag().name(), "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr") || t.isEndTag() && t.asEndTag().name().equals("table")) ) { tb.error(this); boolean processed = tb.process(new Token.EndTag("caption")); if (processed) return tb.process(t); } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { tb.error(this); return false; } else { return tb.process(t, InBody); } return true; } }, InColumnGroup { boolean process(Token t, TreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); return true; } switch (t.type) { case Comment: tb.insert(t.asComment()); break; case Doctype: tb.error(this); break; case StartTag: Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if (name.equals("html")) return tb.process(t, InBody); else if (name.equals("col")) tb.insertEmpty(startTag); else return anythingElse(t, tb); break; case EndTag: Token.EndTag endTag = t.asEndTag(); name = endTag.name(); if (name.equals("colgroup")) { if (tb.currentElement().nodeName().equals("html")) { // frag case tb.error(this);

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> return false; } else { tb.pop(); tb.transition(InTable); } } else return anythingElse(t, tb); break; case EOF: if (tb.currentElement().nodeName().equals("html")) return true; // stop parsing; frag case else return anythingElse(t, tb); default: return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, 
TreeBuilder tb) { boolean processed = tb.process(new Token.EndTag("colgroup")); if (processed) // only ignored in frag case return tb.process(t); return true; } }, InTableBody { boolean process(Token t, TreeBuilder tb) { switch (t.type) { case StartTag: Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if (name.equals("tr")) { tb.clearStackToTableBodyContext(); tb.insert(startTag); tb.transition(InRow); } else if (StringUtil.in(name, "th", "td")) { tb.error(this); tb.process(new Token.StartTag("tr")); return tb.process(startTag); } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead")) { return exitTableBody(t, tb); } else return anythingElse(t, tb); break; case EndTag: Token.EndTag endTag = t.asEndTag(); name = endTag.name(); if (StringUtil.in(name, "tbody", "tfoot", "thead")) { if (!tb.inTableScope(name)) { tb.error(this); return false; } else { tb.clearStackToTableBodyContext(); tb.pop(); tb.transition(InTable); } } else if (name.equals("table")) { return exitTableBody(t, tb); } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th", "tr")) { tb.error(this); return false; } else return anythingElse(t, tb); break; default: return anythingElse

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>(t, tb); } return true; } private boolean exitTableBody(Token t, TreeBuilder tb) { if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb.inScope("tfoot"))) { // frag case tb.error(this); return false; } tb.clearStackToTableBodyContext(); tb.process(new Token.EndTag(tb.currentElement().nodeName())); // tbody, tfoot, 
thead return tb.process(t); } private boolean anythingElse(Token t, TreeBuilder tb) { return tb.process(t, InTable); } }, InRow { boolean process(Token t, TreeBuilder tb) { if (t.isStartTag()) { Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if (StringUtil.in(name, "th", "td")) { tb.clearStackToTableRowContext(); tb.insert(startTag); tb.transition(InCell); tb.insertMarkerToFormattingElements(); } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr")) { return handleMissingTr(t, tb); } else { return anythingElse(t, tb); } } else if (t.isEndTag()) { Token.EndTag endTag = t.asEndTag(); String name = endTag.name(); if (name.equals("tr")) { if (!tb.inTableScope(name)) { tb.error(this); // frag return false; } tb.clearStackToTableRowContext(); tb.pop(); // tr tb.transition(InTableBody); } else if (name.equals("table")) { return handleMissingTr(t, tb); } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { if (!tb.inTableScope(name)) { tb.error(this); return false; } tb.process(new Token.EndTag("tr")); return tb.process(t); } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th")) { tb.error(this); return

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> false; } else { return anythingElse(t, tb); } } else { return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, TreeBuilder tb) { return tb.process(t, InTable); } private boolean handleMissingTr(Token t, TreeBuilder tb) { boolean processed = tb.process(new Token.EndTag("tr")); if (processed) return 
tb.process(t); else return false; } }, InCell { boolean process(Token t, TreeBuilder tb) { if (t.isEndTag()) { Token.EndTag endTag = t.asEndTag(); String name = endTag.name(); if (StringUtil.in(name, "td", "th")) { if (!tb.inTableScope(name)) { tb.error(this); tb.transition(InRow); // might not be in scope if empty: <td /> and processing fake end tag return false; } tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(name); tb.clearFormattingElementsToLastMarker(); tb.transition(InRow); } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html")) { tb.error(this); return false; } else if (StringUtil.in(name, "table", "tbody", "tfoot", "thead", "tr")) { if (!tb.inTableScope(name)) { tb.error(this); return false; } closeCell(tb); return tb.process(t); } else { return anythingElse(t, tb); } } else if (t.isStartTag() && StringUtil.in(t.asStartTag().name(), "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr")) { if (!(tb.inTableScope("td") || tb.inTableScope("th"))) { tb.error(this); return false; } closeCell(tb); return tb.process(t); } else { return anythingElse(t, tb); } return true

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>; } private boolean anythingElse(Token t, TreeBuilder tb) { return tb.process(t, InBody); } private void closeCell(TreeBuilder tb) { if (tb.inTableScope("td")) tb.process(new Token.EndTag("td")); else tb.process(new Token.EndTag("th")); // only here if th or td in scope } }, InSelect { boolean process(Token t, TreeBuilder tb) { 
switch (t.type) { case Character: Token.Character c = t.asCharacter(); if (c.getData().equals(nullString)) { tb.error(this); return false; } else { tb.insert(c); } break; case Comment: tb.insert(t.asComment()); break; case Doctype: tb.error(this); return false; case StartTag: Token.StartTag start = t.asStartTag(); String name = start.name(); if (name.equals("html")) return tb.process(start, InBody); else if (name.equals("option")) { tb.process(new Token.EndTag("option")); tb.insert(start); } else if (name.equals("optgroup")) { if (tb.currentElement().nodeName().equals("option")) tb.process(new Token.EndTag("option")); else if (tb.currentElement().nodeName().equals("optgroup")) tb.process(new Token.EndTag("optgroup")); tb.insert(start); } else if (name.equals("select")) { tb.error(this); return tb.process(new Token.EndTag("select")); } else if (StringUtil.in(name, "input", "keygen", "textarea")) { tb.error(this); if (!tb.inSelectScope("select")) return false; // frag tb.process(new Token.EndTag("select")); return tb.process(start); } else if (name.equals("script")) { return tb.process(t, InHead); } else { return anythingElse(t, tb); } break; case EndTag: Token.EndTag end = t.asEndTag(); name = end.name(); if (name.equals("

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>optgroup")) { if (tb.currentElement().nodeName().equals("option") && tb.aboveOnStack(tb.currentElement()) != null && tb.aboveOnStack(tb.currentElement()).nodeName().equals("optgroup")) tb.process(new Token.EndTag("option")); if (tb.currentElement().nodeName().equals("optgroup")) tb.pop(); else tb.error(this); } else if 
(name.equals("option")) { if (tb.currentElement().nodeName().equals("option")) tb.pop(); else tb.error(this); } else if (name.equals("select")) { if (!tb.inSelectScope(name)) { tb.error(this); return false; } else { tb.popStackToClose(name); tb.resetInsertionMode(); } } else return anythingElse(t, tb); break; case EOF: if (!tb.currentElement().nodeName().equals("html")) tb.error(this); break; default: return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, TreeBuilder tb) { tb.error(this); return false; } }, InSelectInTable { boolean process(Token t, TreeBuilder tb) { if (t.isStartTag() && StringUtil.in(t.asStartTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { tb.error(this); tb.process(new Token.EndTag("select")); return tb.process(t); } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { tb.error(this); if (tb.inTableScope(t.asEndTag().name())) { tb.process(new Token.EndTag("select")); return (tb.process(t)); } else return false; } else { return tb.process(t, InSelect); } } }, AfterBody { boolean process(Token t, TreeBuilder tb) { if (isWhitespace(t)) { return tb.process

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>(t, InBody); } else if (t.isComment()) { tb.insert(t.asComment()); // into html node } else if (t.isDoctype()) { tb.error(this); return false; } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { return tb.process(t, InBody); } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { if 
(tb.isFragmentParsing()) { tb.error(this); return false; } else { tb.transition(AfterAfterBody); } } else if (t.isEOF()) { // chillax! we're done } else { tb.error(this); tb.transition(InBody); return tb.process(t); } return true; } }, InFrameset { boolean process(Token t, TreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); } else if (t.isComment()) { tb.insert(t.asComment()); } else if (t.isDoctype()) { tb.error(this); return false; } else if (t.isStartTag()) { Token.StartTag start = t.asStartTag(); String name = start.name(); if (name.equals("html")) { return tb.process(start, InBody); } else if (name.equals("frameset")) { tb.insert(start); } else if (name.equals("frame")) { tb.insertEmpty(start); } else if (name.equals("noframes")) { return tb.process(start, InHead); } else { tb.error(this); return false; } } else if (t.isEndTag() && t.asEndTag().name().equals("frameset")) { if (tb.currentElement().nodeName().equals("html")) { // frag tb.error(this); return false; } else { tb.pop(); if (!tb.isFragmentParsing() && !tb.currentElement().nodeName().equals("frameset")) { tb.transition(AfterFrameset); } } } else if (t.is

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> work chuck } else if (t.isStartTag() && t.asStartTag().name().equals("nofrmes")) { return tb.process(t, InHead); } else { tb.error(this); tb.transition(InBody); return tb.process(t); } return true; } }, ForeignContent { boolean process(Token t, TreeBuilder tb) { return true; // todo: implement. Also; how do we get here? 
} }; private static String nullString = String.valueOf(0x0000); abstract boolean process(Token t, TreeBuilder tb); private static boolean isWhitespace(Token t) { if (t.isCharacter()) { String data = t.asCharacter().getData(); // todo: this checks more than spec - "\t", "\n", "\f", "\r", " " for (int i = 0; i < data.length(); i++) { char c = data.charAt(i); if (!Character.isWhitespace(c)) return false; } return true; } return false; } private static void handleRcData(Token.StartTag startTag, TreeBuilder tb) { tb.insert(startTag); tb.tokeniser.transition(TokeniserState.Rcdata); tb.markInsertionMode(); tb.transition(Text); } private static void handleRawtext(Token.StartTag startTag, TreeBuilder tb) { tb.insert(startTag); tb.tokeniser.transition(TokeniserState.Rawtext); tb.markInsertionMode(); tb.transition(Text); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.parser; import org.jsoup.helper.Validate; /** * A character queue with parsing helpers. * * @author Jonathan Hedley */ public class TokenQueue { private String queue; private int pos = 0; private static final char ESC = '\\'; // escape char for chomp balanced. /** Create a new TokenQueue. 
@param data string of data to back queue. */ public TokenQueue(String data) { Validate.notNull(data); queue = data; } /** * Is the queue empty? * @return true if no data left in queue. */ public boolean isEmpty() { return remainingLength() == 0; } private int remainingLength() { return queue.length() - pos; } /** * Retrieves but does not remove the first character from the queue. * @return First character, or 0 if empty. */ public char peek() { return isEmpty() ? 0 : queue.charAt(pos); } /** Add a character to the start of the queue (will be the next character retrieved). @param c character to add */ public void addFirst(Character c) { addFirst(c.toString()); } /** Add a string to the start of the queue. @param seq string to add. */ public void addFirst(String seq) { // not very performant, but an edge case queue = seq + queue.substring(pos); pos = 0; } /** * Tests if the next characters on the queue match the sequence. Case insensitive. * @param seq String to check queue for. * @return true if the next characters match. */ public boolean matches(String seq) { return queue.regionMatches(true, pos, seq, 0, seq.length()); } /** * Case sensitive match test. * @param seq * @return */ public boolean matchesCS(String seq) { return queue.startsWith(seq, pos); } /** Tests if the next characters match any of the sequences. Case insensitive. @param seq @return */ public boolean matchesAny(String... seq) { for (String s : seq) { if (matches(s)) return true; } return false; }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> public boolean matchesAny(char... 
seq) { if (isEmpty()) return false; for (char c: seq) { if (queue.charAt(pos) == c) return true; } return false; } public boolean matchesStartTag() { // micro opt for matching "<x" return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character.isLetter(queue.charAt(pos+1))); } /** * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the * queue. * @param seq String to search for, and if found, remove from queue. * @return true if found and removed, false if not found. */ public boolean matchChomp(String seq) { if (matches(seq)) { pos += seq.length(); return true; } else { return false; } } /** Tests if queue starts with a whitespace character. @return if starts with whitespace */ public boolean matchesWhitespace() { return !isEmpty() && Character.isWhitespace(queue.charAt(pos)); } /** Test if the queue matches a word character (letter or digit). @return if matches a word character */ public boolean matchesWord() { return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos)); } /** * Drops the next character off the queue. */ public void advance() { if (!isEmpty()) pos++; } /** * Consume one character off queue. * @return first character on queue. */ public char consume() { return queue.charAt(pos++); } /** * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will * throw an illegal state exception -- but you should be running match() against that condition. <p> Case insensitive. * @param seq sequence to remove from head of queue. */ public void consume(String seq) { if (!matches(seq)) throw new IllegalStateException("Queue did not match expected sequence"); int len = seq.length(); if (len > remainingLength()) throw new IllegalStateException("Queue not long enough to consume sequence"); pos += len; } /** * Pulls a string off the queue, up to but exclusive of the match

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> sequence, or to the queue running out. * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b> * @return The matched data consumed from queue. 
*/ public String consumeTo(String seq) { int offset = queue.indexOf(seq, pos); if (offset != -1) { String consumed = queue.substring(pos, offset); pos += consumed.length(); return consumed; } else { return remainder(); } } public String consumeToIgnoreCase(String seq) { int start = pos; String first = seq.substring(0, 1); boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of while (!isEmpty()) { if (matches(seq)) break; if (canScan) { int skip = queue.indexOf(first, pos) - pos; if (skip == 0) // this char is the skip char, but not match, so force advance of pos pos++; else if (skip < 0) // no chance of finding, grab to end pos = queue.length(); else pos += skip; } else pos++; } String data = queue.substring(start, pos); return data; } /** Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue. @param seq any number of terminators to consume to. <b>Case insensitive.</b> @return consumed string */ // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this // is is a case sensitive time... public String consumeToAny(String... seq) { int start = pos; while (!isEmpty() && !matchesAny(seq)) { pos++; } String data = queue.substring(start, pos); return data; } /** * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it). * <p> * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go * isEmpty() == true). * @param seq String to match up to, and not include

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> in return, and to pull off queue. <b>Case sensitive.</b> * @return Data matched from queue. 
*/ public String chompTo(String seq) { String data = consumeTo(seq); matchChomp(seq); return data; } public String chompToIgnoreCase(String seq) { String data = consumeToIgnoreCase(seq); // case insensitive scan matchChomp(seq); return data; } /** * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three", * and leave " four" on the queue. Unbalanced openers and closers can be escaped (with \). Those escapes will be left * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for * contains text strings; use unescape for that. * @param open opener * @param close closer * @return data matched from the queue */ public String chompBalanced(char open, char close) { StringBuilder accum = new StringBuilder(); int depth = 0; char last = 0; do { if (isEmpty()) break; Character c = consume(); if (last == 0 || last != ESC) { if (c.equals(open)) depth++; else if (c.equals(close)) depth--; } if (depth > 0 && last != 0) accum.append(c); // don't include the outer match pair in the return last = c; } while (depth > 0); return accum.toString(); } /** * Unescaped a \ escaped string. * @param in backslash escaped string * @return unescaped string */ public static String unescape(String in) { StringBuilder out = new StringBuilder(); char last = 0; for (char c : in.toCharArray()) { if (c == ESC) { if (last != 0 && last == ESC) out.append(c); } else out.append(c); last = c; } return out.toString(); } /** * Pulls the next run of whitespace characters of the queue. */ public boolean consumeWhitespace() { boolean seen = false;

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> while (matchesWhitespace()) { pos++; seen = true; } return seen; } /** * Retrieves the next run of word type (letter or digit) off the queue. * @return String of word characters from queue, or empty string if none. 
*/ public String consumeWord() { int start = pos; while (matchesWord()) pos++; return queue.substring(start, pos); } /** * Consume an tag name off the queue (word or :, _, -) * * @return tag name */ public String consumeTagName() { int start = pos; while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) pos++; return queue.substring(start, pos); } /** * Consume a CSS element selector (tag name, but | instead of : for namespaces, to not conflict with :pseudo selects). * * @return tag name */ public String consumeElementSelector() { int start = pos; while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-'))) pos++; return queue.substring(start, pos); } /** Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _) http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier @return identifier */ public String consumeCssIdentifier() { int start = pos; while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) pos++; return queue.substring(start, pos); } /** Consume an attribute key off the queue (letter, digit, -, _, :") @return attribute key */ public String consumeAttributeKey() { int start = pos; while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) pos++; return queue.substring(start, pos); } /** Consume and return whatever is left on the queue. @return remained of queue. */ public String remainder() { StringBuilder accum = new StringBuilder(); while (!isEmpty()) { accum.append(consume()); } return accum.toString(); } public String toString() { return queue.substring(pos); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.parser; import org.jsoup.Jsoup; import org.jsoup.TextUtil; import org.jsoup.nodes.Comment; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; import org.junit.Test; import java.util.List; import static org.junit.Assert.assertEquals; 
import static org.junit.Assert.assertTrue; /** Tests for the Parser @author Jonathan Hedley, jonathan@hedley.net */ public class ParserTest { @Test public void parsesSimpleDocument() { String html = "<html><head><title>First!</title></head><body><p>First post! <img src=\"foo.png\" /></p></body></html>"; Document doc = Jsoup.parse(html); // need a better way to verify these: Element p = doc.body().child(0); assertEquals("p", p.tagName()); Element img = p.child(0); assertEquals("foo.png", img.attr("src")); assertEquals("img", img.tagName()); } @Test public void parsesRoughAttributes() { String html = "<html><head><title>First!</title></head><body><p class=\"foo > bar\">First post! <img src=\"foo.png\" /></p></body></html>"; Document doc = Jsoup.parse(html); // need a better way to verify these: Element p = doc.body().child(0); assertEquals("p", p.tagName()); assertEquals("foo > bar", p.attr("class")); } @Test public void parsesQuiteRoughAttributes() { String html = "<p =a>One<a <p>Something</p>Else"; // this gets a <p> with attr '=a' and an <a tag with an attribue named '<p'; and then auto-recreated Document doc = Jsoup.parse(html); assertEquals("<p =a=\"\">One<a <p=\"\">Something</a></p>\n" + "<a <p=\"\">Else</a>", doc.body().html()); doc = Jsoup.parse("<p .....>"); assertEquals("<p .....=\"\"></p>", doc.

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>body().html()); } @Test public void parsesComments() { String html = "<html><head></head><body><img src=foo><!-- <table><tr><td></table> --><p>Hello</p></body></html>"; Document doc = Jsoup.parse(html); Element body = doc.body(); Comment comment = (Comment) body.childNode(1); // comment should not be sub of img, as it's an empty 
tag assertEquals(" <table><tr><td></table> ", comment.getData()); Element p = body.child(1); TextNode text = (TextNode) p.childNode(0); assertEquals("Hello", text.getWholeText()); } @Test public void parsesUnterminatedComments() { String html = "<p>Hello<!-- <tr><td>"; Document doc = Jsoup.parse(html); Element p = doc.getElementsByTag("p").get(0); assertEquals("Hello", p.text()); TextNode text = (TextNode) p.childNode(0); assertEquals("Hello", text.getWholeText()); Comment comment = (Comment) p.childNode(1); assertEquals(" <tr><td>", comment.getData()); } @Test public void dropsUnterminatedTag() { // jsoup used to parse this to <p>, but whatwg, webkit will drop. String h1 = "<p"; Document doc = Jsoup.parse(h1); assertEquals(0, doc.getElementsByTag("p").size()); assertEquals("", doc.text()); String h2 = "<div id=1<p id='2'"; doc = Jsoup.parse(h2); assertEquals("", doc.text()); } @Test public void dropsUnterminatedAttribute() { // jsoup used to parse this to <p id="foo">, but whatwg, webkit will drop. String h1 = "<p id=\"foo"; Document doc = Jsoup.parse(h1); assertEquals("", doc.text()); } @Test public void parsesUnterminatedTextarea() { // don't parse right to end, but break on <p> Document doc = Jsoup.parse("<body><p><textarea>one<p>two"); Element t = doc.select("textarea").first(); assertEquals("one", t.text()); assertEquals("two", doc.select("p").get(

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>1).text()); } @Test public void parsesUnterminatedOption() { // bit weird this -- browsers and spec get stuck in select until there's a </select> Document doc = Jsoup.parse("<body><p><select><option>One<option>Two</p><p>Three</p>"); Elements options = doc.select("option"); assertEquals(2, options.size()); assertEquals("One", 
options.first().text()); assertEquals("TwoThree", options.last().text()); } @Test public void testSpaceAfterTag() { Document doc = Jsoup.parse("<div > <a name=\"top\"></a ><p id=1 >Hello</p></div>"); assertEquals("<div> <a name=\"top\"></a><p id=\"1\">Hello</p></div>", TextUtil.stripNewlines(doc.body().html())); } @Test public void createsDocumentStructure() { String html = "<meta name=keywords /><link rel=stylesheet /><title>jsoup</title><p>Hello world</p>"; Document doc = Jsoup.parse(html); Element head = doc.head(); Element body = doc.body(); assertEquals(1, doc.children().size()); // root node: contains html node assertEquals(2, doc.child(0).children().size()); // html node: head and body assertEquals(3, head.children().size()); assertEquals(1, body.children().size()); assertEquals("keywords", head.getElementsByTag("meta").get(0).attr("name")); assertEquals(0, body.getElementsByTag("meta").size()); assertEquals("jsoup", doc.title()); assertEquals("Hello world", body.text()); assertEquals("Hello world", body.children().get(0).text()); } @Test public void createsStructureFromBodySnippet() { // the bar baz stuff naturally goes into the body, but the 'foo' goes into root, and the normalisation routine // needs to move into the start of the body String html = "foo <b>bar</b> baz"; Document doc = Jsoup.parse(html); assertEquals("foo bar baz", doc.text()); } @Test public void handlesEscapedData() { String html = "<div title='Surf &amp; Turf'>Reef &amp; Beef</div>"; Document doc = Jsoup.parse(html); Element

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> div = doc.getElementsByTag("div").get(0); assertEquals("Surf & Turf", div.attr("title")); assertEquals("Reef & Beef", div.text()); } @Test public void handlesDataOnlyTags() { String t = "<style>font-family: bold</style>"; List<Element> tels = Jsoup.parse(t).getElementsByTag("style"); assertEquals("font-family: bold", 
tels.get(0).data()); assertEquals("", tels.get(0).text()); String s = "<p>Hello</p><script>Nope</script><p>There</p>"; Document doc = Jsoup.parse(s); assertEquals("Hello There", doc.text()); assertEquals("Nope", doc.data()); } @Test public void handlesTextAfterData() { String h = "<html><body>pre <script>inner</script> aft</body></html>"; Document doc = Jsoup.parse(h); assertEquals("<html><head></head><body>pre <script>inner</script> aft</body></html>", TextUtil.stripNewlines(doc.html())); } @Test public void handlesTextArea() { Document doc = Jsoup.parse("<textarea>Hello</textarea>"); Elements els = doc.select("textarea"); assertEquals("Hello", els.text()); assertEquals("Hello", els.val()); } @Test public void doesNotCreateImplicitLists() { // old jsoup used to wrap this in <ul>, but that's not to spec String h = "<li>Point one<li>Point two"; Document doc = Jsoup.parse(h); Elements ol = doc.select("ul"); // should NOT have created a default ul. assertEquals(0, ol.size()); Elements lis = doc.select("li"); assertEquals(2, lis.size()); assertEquals("body", lis.first().parent().tagName()); // no fiddling with non-implicit lists String h2 = "<ol><li><p>Point the first<li><p>Point the second"; Document doc2 = Jsoup.parse(h2); assertEquals(0, doc2.select("ul").size()); assertEquals(1, doc2.select("ol").size()); assertEquals(2, doc2.select("ol li").size()); assertEquals(2, doc

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>2.select("ol li p").size()); assertEquals(1, doc2.select("ol li").get(0).children().size()); // one p in first li } @Test public void discardsNakedTds() { // jsoup used to make this into an implicit table; but browsers make it into a text run String h = "<td>Hello<td><p>There<p>now"; Document doc = Jsoup.parse(h); 
assertEquals("Hello<p>There</p><p>now</p>", TextUtil.stripNewlines(doc.body().html())); // <tbody> is introduced if no implicitly creating table, but allows tr to be directly under table } @Test public void handlesNestedImplicitTable() { Document doc = Jsoup.parse("<table><td>1</td></tr> <td>2</td></tr> <td> <table><td>3</td> <td>4</td></table> <tr><td>5</table>"); assertEquals("<table><tbody><tr><td>1</td></tr> <tr><td>2</td></tr> <tr><td> <table><tbody><tr><td>3</td> <td>4</td></tr></tbody></table> </td></tr><tr><td>5</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html())); } @Test public void handlesWhatWgExpensesTableExample() { // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#examples-0 Document doc = Jsoup.parse("<table> <colgroup> <col> <colgroup> <col> <col> <col> <thead> <tr> <th> <th>2008 <th>2007 <th>2006 <tbody> <tr> <th scope=rowgroup> Research and development <td> $ 1,109 <td> $ 782 <td> $ 712 <tr> <th scope=row> Percentage of net sales <td> 3.4% <td> 3.3% <td> 3.7% <tbody> <tr> <th scope=rowgroup> Selling, general, and administrative <td> $ 3,7

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>tbody></table>", TextUtil.stripNewlines(doc.body().html())); } @Test public void noTableDirectInTable() { Document doc = Jsoup.parse("<table> <td>One <td><table><td>Two</table> <table><td>Three"); assertEquals("<table> <tbody><tr><td>One </td><td><table><tbody><tr><td>Two</td></tr></tbody></table> 
<table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html())); } @Test public void ignoresDupeEndTrTag() { Document doc = Jsoup.parse("<table><tr><td>One</td><td><table><tr><td>Two</td></tr></tr></table></td><td>Three</td></tr></table>"); // two </tr></tr>, must ignore or will close table assertEquals("<table><tbody><tr><td>One</td><td><table><tbody><tr><td>Two</td></tr></tbody></table></td><td>Three</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html())); } @Test public void handlesBaseTags() { // todo -- don't handle base tags like this -- spec and browsers don't (any more -- v. old ones do). // instead, just maintain one baseUri in the doc String h = "<a href=1>#</a><base href='/2/'><a href='3'>#</a><base href='http://bar'><a href=4>#</a>"; Document doc = Jsoup.parse(h, "http://foo/"); assertEquals("http://bar", doc.baseUri()); // gets updated as base changes, so doc.createElement has latest. Elements anchors = doc.getElementsByTag("a"); assertEquals(3, anchors.size()); assertEquals("http://foo/", anchors.get(0).baseUri()); assertEquals("http://foo/2/", anchors.get(1).baseUri()); assertEquals("http://bar", anchors.get(2).baseUri()); assertEquals("http://foo/1", anchors.get(0).absUrl("href")); assertEquals("http://foo/2/3", anchors.get(1).absUrl("href")); assertEquals("

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>http://bar/4", anchors.get(2).absUrl("href")); } @Test public void handlesCdata() { // todo: as this is html namespace, should actually treat as bogus comment, not cdata. 
keep as cdata for now String h = "<div id=1><![CDATA[<html>\n<foo><&amp;]]></div>"; // the &amp; in there should remain literal Document doc = Jsoup.parse(h); Element div = doc.getElementById("1"); assertEquals("<html> <foo><&amp;", div.text()); assertEquals(0, div.children().size()); assertEquals(1, div.childNodes().size()); // no elements, one text node } @Test public void handlesInvalidStartTags() { String h = "<div>Hello < There <&amp;></div>"; // parse to <div {#text=Hello < There <&>}> Document doc = Jsoup.parse(h); assertEquals("Hello < There <&>", doc.select("div").first().text()); } @Test public void handlesUnknownTags() { String h = "<div><foo title=bar>Hello<foo title=qux>there</foo></div>"; Document doc = Jsoup.parse(h); Elements foos = doc.select("foo"); assertEquals(2, foos.size()); assertEquals("bar", foos.first().attr("title")); assertEquals("qux", foos.last().attr("title")); assertEquals("there", foos.last().text()); } @Test public void handlesUnknownInlineTags() { String h = "<p><cust>Test</cust></p><p><cust><cust>Test</cust></cust></p>"; Document doc = Jsoup.parseBodyFragment(h); String out = doc.body().html(); assertEquals(h, TextUtil.stripNewlines(out)); } @Test public void parsesBodyFragment() { String h = "<!-- comment --><p><a href='foo'>One</a></p>"; Document doc = Jsoup.parseBodyFragment(h, "http://example.com"); assertEquals("<body><!-- comment --><p><a href=\"foo\">One</a></p></body>", TextUtil.stripNewlines(doc.body().outerHtml())); assertEquals("http://example.com/foo", doc.select("a").

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>first().absUrl("href")); } @Test public void handlesUnknownNamespaceTags() { // note that the first foo:bar should not really be allowed to be self closing, if parsed in html mode. 
String h = "<foo:bar id='1' /><abc:def id=2>Foo<p>Hello</p></abc:def><foo:bar>There</foo:bar>"; Document doc = Jsoup.parse(h); assertEquals("<foo:bar id=\"1\" /><abc:def id=\"2\">Foo<p>Hello</p></abc:def><foo:bar>There</foo:bar>", TextUtil.stripNewlines(doc.body().html())); } @Test public void handlesKnownEmptyBlocks() { // if known tag, must be defined as self closing to allow as self closing. unkown tags can be self closing. String h = "<div id='1' /><div id=2><img /><img></div> <hr /> hr text <hr> hr text two"; Document doc = Jsoup.parse(h); Element div1 = doc.getElementById("1"); assertTrue(!div1.children().isEmpty()); // <div /> is treated as <div>... assertTrue(doc.select("hr").first().children().isEmpty()); assertTrue(doc.select("hr").last().children().isEmpty()); assertTrue(doc.select("img").first().children().isEmpty()); assertTrue(doc.select("img").last().children().isEmpty()); } @Test public void handlesSolidusAtAttributeEnd() { // this test makes sure [<a href=/>link</a>] is parsed as [<a href="/">link</a>], not [<a href="" /><a>link</a>] String h = "<a href=/>link</a>"; Document doc = Jsoup.parse(h); assertEquals("<a href=\"/\">link</a>", doc.body().html()); } @Test public void handlesMultiClosingBody() { String h = "<body><p>Hello</body><p>there</p></body></body></html><p>now"; Document doc = Jsoup.parse(h); assertEquals(3, doc.select("p").size()); assertEquals(3, doc.body().children().size()); } @Test public void handlesUnclosedDefinitionLists() { // jsoup used to create a <dl>, but that's not

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> to spec String h = "<dt>Foo<dd>Bar<dt>Qux<dd>Zug"; Document doc = Jsoup.parse(h); assertEquals(0, doc.select("dl").size()); // no auto dl assertEquals(4, doc.select("dt, dd").size()); Elements dts = doc.select("dt"); assertEquals(2, dts.size()); assertEquals("Zug", dts.get(1).nextElementSibling().text()); } @Test public void 
handlesBlocksInDefinitions() { // per the spec, dt and dd are inline, but in practise are block String h = "<dl><dt><div id=1>Term</div></dt><dd><div id=2>Def</div></dd></dl>"; Document doc = Jsoup.parse(h); assertEquals("dt", doc.select("#1").first().parent().tagName()); assertEquals("dd", doc.select("#2").first().parent().tagName()); assertEquals("<dl><dt><div id=\"1\">Term</div></dt><dd><div id=\"2\">Def</div></dd></dl>", TextUtil.stripNewlines(doc.body().html())); } @Test public void handlesFrames() { String h = "<html><head><script></script><noscript></noscript></head><frameset><frame src=foo></frame><frame src=foo></frameset></html>"; Document doc = Jsoup.parse(h); assertEquals("<html><head><script></script><noscript></noscript></head><frameset><frame src=\"foo\" /><frame src=\"foo\" /></frameset></html>", TextUtil.stripNewlines(doc.html())); // no body auto vivification } @Test public void handlesJavadocFont() { String h = "<TD BGCOLOR=\"#EEEEFF\" CLASS=\"NavBarCell1\"> <A HREF=\"deprecated-list.html\"><FONT CLASS=\"NavBarFont1\"><B>Deprecated</B></FONT></A>&nbsp;</TD>"; Document doc = Jsoup.parse(h); Element a = doc.select("a").first(); assertEquals("Deprecated", a.text()); assertEquals("font", a.child(0).tagName()); assertEquals("b", a.child(0).child(0).tagName()); } @Test public void handlesBaseWithoutHref() { String h = "<head><base target='_blank'></head><body><a

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> href=/foo>Test</a></body>"; Document doc = Jsoup.parse(h, "http://example.com/"); Element a = doc.select("a").first(); assertEquals("/foo", a.attr("href")); assertEquals("http://example.com/foo", a.attr("abs:href")); } @Test public void normalisesDocument() { String h = "<!doctype 
html>One<html>Two<head>Three<link></head>Four<body>Five </body>Six </html>Seven "; Document doc = Jsoup.parse(h); assertEquals("<!DOCTYPE html><html><head></head><body>OneTwoThree<link />FourFive Six Seven </body></html>", TextUtil.stripNewlines(doc.html())); } @Test public void normalisesEmptyDocument() { Document doc = Jsoup.parse(""); assertEquals("<html><head></head><body></body></html>", TextUtil.stripNewlines(doc.html())); } @Test public void normalisesHeadlessBody() { Document doc = Jsoup.parse("<html><body><span class=\"foo\">bar</span>"); assertEquals("<html><head></head><body><span class=\"foo\">bar</span></body></html>", TextUtil.stripNewlines(doc.html())); } @Test public void findsCharsetInMalformedMeta() { String h = "<meta http-equiv=Content-Type content=text/html; charset=gb2312>"; // example cited for reason of html5's <meta charset> element Document doc = Jsoup.parse(h); assertEquals("gb2312", doc.select("meta").attr("charset")); } @Test public void testHgroup() { // jsoup used to not allow hroup in h{n}, but that's not in spec, and browsers are OK Document doc = Jsoup.parse("<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>"); assertEquals("<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup> <hgroup><h1>More</h1><p>stuff</p></h

Jsoup, 14

<FILEB>
<CHANGES>
/**
 * Returns the tag name held by {@code lastStartTag}.
 * Callers use it to build the {@code "</name"} marker they look for in the remaining input and
 * to create the matching end tag token.
 * NOTE(review): presumably {@code lastStartTag} is the most recently emitted start tag - confirm.
 *
 * @return {@code lastStartTag.tagName}
 * @throws NullPointerException if {@code lastStartTag} is {@code null}
 */
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>>Hello <div>there</div> <span>now</span></a>", TextUtil.stripNewlines(doc.body().html())); } @Test public void testFontFlowContents() { // html5 has no definition of <font>; often used as flow Document doc = Jsoup.parse("<font>Hello <div>there</div> <span>now</span></font>"); assertEquals("<font>Hello <div>there</div> 
<span>now</span></font>", TextUtil.stripNewlines(doc.body().html())); } @Test public void handlesMisnestedTagsBI() { // whatwg: <b><i></b></i> String h = "<p>1<b>2<i>3</b>4</i>5</p>"; Document doc = Jsoup.parse(h); assertEquals("<p>1<b>2<i>3</i></b><i>4</i>5</p>", doc.body().html()); // adoption agency on </b>, reconstruction of formatters on 4. } @Test public void handlesMisnestedTagsBP() { // whatwg: <b><p></b></p> String h = "<b>1<p>2</b>3</p>"; Document doc = Jsoup.parse(h); assertEquals("<b>1</b>\n<p><b>2</b>3</p>", doc.body().html()); } @Test public void handlesUnexpectedMarkupInTables() { // whatwg - tests markers in active formatting (if they didn't work, would get in in table) // also tests foster parenting String h = "<table><b><tr><td>aaa</td></tr>bbb</table>ccc"; Document doc = Jsoup.parse(h); assertEquals("<b></b><b>bbb</b><table><tbody><tr><td>aaa</td></tr></tbody></table><b>ccc</b>", TextUtil.stripNewlines(doc.body().html())); } @Test public void handlesUnclosedFormattingElements() { // whatwg: formatting elements get collected and applied, but excess elements are thrown away String h = "<!DOCTYPE html>\n" + "<p><b class=x><b class=x><b><b class=x><b class=x><b

Jsoup, 14

<FILEB>
<CHANGES>
/**
 * Returns the tag name held by {@code lastStartTag}.
 * Callers use it to build the {@code "</name"} marker they look for in the remaining input and
 * to create the matching end tag token.
 * NOTE(review): presumably {@code lastStartTag} is the most recently emitted start tag - confirm.
 *
 * @return {@code lastStartTag.tagName}
 * @throws NullPointerException if {@code lastStartTag} is {@code null}
 */
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>>X\n" + "<p>X\n" + "<p><b><b class=x><b>X\n" + "<p></b></b></b></b></b></b>X"; Document doc = Jsoup.parse(h); doc.outputSettings().indentAmount(0); String want = "<!DOCTYPE html>\n" + "<html>\n" + "<head></head>\n" + "<body>\n" + "<p><b class=\"x\"><b class=\"x\"><b><b class=\"x\"><b class=\"x\"><b>X 
</b></b></b></b></b></b></p>\n" + "<p><b class=\"x\"><b><b class=\"x\"><b class=\"x\"><b>X </b></b></b></b></b></p>\n" + "<p><b class=\"x\"><b><b class=\"x\"><b class=\"x\"><b><b><b class=\"x\"><b>X </b></b></b></b></b></b></b></b></p>\n" + "<p>X</p>\n" + "</body>\n" + "</html>"; assertEquals(want, doc.html()); } @Test public void reconstructFormattingElements() { // tests attributes and multi b String h = "<p><b class=one>One <i>Two <b>Three</p><p>Hello</p>"; Document doc = Jsoup.parse(h); assertEquals("<p><b class=\"one\">One <i>Two <b>Three</b></i></b></p>\n<p><b class=\"one\"><i><b>Hello</b></i></b></p>", doc.body().html()); } @Test public void reconstructFormattingElementsInTable() { // tests that tables get formatting markers -- the <b> applies outside the table and does not leak in, // and the <i> inside the table and does not leak out. String h = "<p><b>One</p> <table><tr><td><p><i>Three<p>Four</i></td></tr></table> <p>Five</p>"; Document doc = Jsoup.parse(h); String want = "<p><b>One</b></p>\n" + "<b> \n"

Jsoup, 14

<FILEB>
<CHANGES>
/**
 * Returns the tag name held by {@code lastStartTag}.
 * Callers use it to build the {@code "</name"} marker they look for in the remaining input and
 * to create the matching end tag token.
 * NOTE(review): presumably {@code lastStartTag} is the most recently emitted start tag - confirm.
 *
 * @return {@code lastStartTag.tagName}
 * @throws NullPointerException if {@code lastStartTag} is {@code null}
 */
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> + " <table>\n" + " <tbody>\n" + " <tr>\n" + " <td><p><i>Three</i></p><p><i>Four</i></p></td>\n" + " </tr>\n" + " </tbody>\n" + " </table> <p>Five</p></b>"; assertEquals(want, doc.body().html()); } @Test public void commentBeforeHtml() { String h = "<!-- comment --><!-- comment 2 --><p>One</p>"; Document doc = Jsoup.parse(h); 
assertEquals("<!-- comment --><!-- comment 2 --><html><head></head><body><p>One</p></body></html>", TextUtil.stripNewlines(doc.html())); } @Test public void emptyTdTag() { String h = "<table><tr><td>One</td><td id='2' /></tr></table>"; Document doc = Jsoup.parse(h); assertEquals("<td>One</td>\n<td id=\"2\"></td>", doc.select("tr").first().html()); } @Test public void handlesSolidusInA() { // test for bug #66 String h = "<a class=lp href=/lib/14160711/>link text</a>"; Document doc = Jsoup.parse(h); Element a = doc.select("a").first(); assertEquals("link text", a.text()); assertEquals("/lib/14160711/", a.attr("href")); } @Test public void handlesSpanInTbody() { // test for bug 64 String h = "<table><tbody><span class='1'><tr><td>One</td></tr><tr><td>Two</td></tr></span></tbody></table>"; Document doc = Jsoup.parse(h); assertEquals(doc.select("span").first().children().size(), 0); // the span gets closed assertEquals(doc.select("table").size(), 1); // only one table } @Test public void handlesUnclosedTitleAtEof() { assertEquals("Data", Jsoup.parse("<title>Data").title()); assertEquals("Data<", Jsoup.parse("<title>Data<").title()); assertEquals("Data</

Jsoup, 14

<FILEB>
<CHANGES>
/**
 * Returns the tag name held by {@code lastStartTag}.
 * Callers use it to build the {@code "</name"} marker they look for in the remaining input and
 * to create the matching end tag token.
 * NOTE(review): presumably {@code lastStartTag} is the most recently emitted start tag - confirm.
 *
 * @return {@code lastStartTag.tagName}
 * @throws NullPointerException if {@code lastStartTag} is {@code null}
 */
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.nodes; import org.jsoup.helper.StringUtil; /** * A {@code <!DOCTPYE>} node. 
*/ public class DocumentType extends Node { // todo: quirk mode from publicId and systemId private DocumentType() {} public DocumentType(String name, String publicId, String systemId, String baseUri) { super(baseUri); attr("name", name); attr("publicId", publicId); attr("systemId", systemId); } @Override public String nodeName() { return "#doctype"; } @Override void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { accum.append("<!DOCTYPE html"); if (!StringUtil.isBlank(attr("publicId"))) accum.append(" PUBLIC \"").append(attr("publicId")).append("\""); if (!StringUtil.isBlank(attr("systemId"))) accum.append(' ').append(attr("systemId")).append("\""); accum.append('>'); } @Override void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) { } }

Jsoup, 14

<FILEB>
<CHANGES>
/**
 * Returns the tag name held by {@code lastStartTag}.
 * Callers use it to build the {@code "</name"} marker they look for in the remaining input and
 * to create the matching end tag token.
 * NOTE(review): presumably {@code lastStartTag} is the most recently emitted start tag - confirm.
 *
 * @return {@code lastStartTag.tagName}
 * @throws NullPointerException if {@code lastStartTag} is {@code null}
 */
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.parser; import org.jsoup.helper.Validate; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; /** * Parse tokens for the Tokeniser. 
*/ abstract class Token { TokenType type; private Token() { } static class Doctype extends Token { final StringBuilder name = new StringBuilder(); final StringBuilder publicIdentifier = new StringBuilder(); final StringBuilder systemIdentifier = new StringBuilder(); boolean forceQuirks = false; Doctype() { type = TokenType.Doctype; } String getName() { return name.toString(); } String getPublicIdentifier() { return publicIdentifier.toString(); } public String getSystemIdentifier() { return systemIdentifier.toString(); } public boolean isForceQuirks() { return forceQuirks; } } static abstract class Tag extends Token { protected String tagName; private String pendingAttributeName; private String pendingAttributeValue; boolean selfClosing = false; Attributes attributes = new Attributes(); // todo: allow nodes to not have attributes void newAttribute() { if (pendingAttributeName != null) { if (pendingAttributeValue == null) pendingAttributeValue = ""; Attribute attribute = new Attribute(pendingAttributeName, pendingAttributeValue); attributes.put(attribute); } pendingAttributeName = null; pendingAttributeValue = null; } void finaliseTag() { // finalises for emit if (pendingAttributeName != null) { // todo: check if attribute name exists; if so, drop and error newAttribute(); } } String name() { Validate.isFalse(tagName.isEmpty()); return tagName; } Tag name(String name) { tagName = name; return this; } boolean isSelfClosing() { return selfClosing; } @SuppressWarnings({"TypeMayBeWeakened"}) Attributes getAttributes() { return attributes; } // these appenders are rarely hit in not null state-- caused by null chars. void appendTagName(String append) { tagName = tagName == null ? append : tagName.concat(append); } void appendTagName(char append) { appendTagName(String.valueOf(append)); } void appendAttributeName(String append) { pendingAttributeName = pendingAttributeName == null ? append : pendingAttributeName.concat(append); } void appendAttributeName(char append)

Jsoup, 14

<FILEB>
<CHANGES>
/**
 * Returns the tag name held by {@code lastStartTag}.
 * Callers use it to build the {@code "</name"} marker they look for in the remaining input and
 * to create the matching end tag token.
 * NOTE(review): presumably {@code lastStartTag} is the most recently emitted start tag - confirm.
 *
 * @return {@code lastStartTag.tagName}
 * @throws NullPointerException if {@code lastStartTag} is {@code null}
 */
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> { appendAttributeName(String.valueOf(append)); } void appendAttributeValue(String append) { pendingAttributeValue = pendingAttributeValue == null ? 
append : pendingAttributeValue.concat(append); } void appendAttributeValue(char append) { appendAttributeValue(String.valueOf(append)); } } static class StartTag extends Tag { StartTag() { super(); type = TokenType.StartTag; } StartTag(String name) { this(); this.tagName = name; } StartTag(String name, Attributes attributes) { this(); this.tagName = name; this.attributes = attributes; } @Override public String toString() { return "<" + name() + " " + attributes.toString() + ">"; } } static class EndTag extends Tag{ EndTag() { super(); type = TokenType.EndTag; } EndTag(String name) { this(); this.tagName = name; } @Override public String toString() { return "</" + name() + " " + attributes.toString() + ">"; } } static class Comment extends Token { final StringBuilder data = new StringBuilder(); Comment() { type = TokenType.Comment; } String getData() { return data.toString(); } @Override public String toString() { return "<!--" + getData() + "-->"; } } static class Character extends Token { private final String data; Character(String data) { type = TokenType.Character; this.data = data; } String getData() { return data; } @Override public String toString() { return getData(); } } static class EOF extends Token { EOF() { type = Token.TokenType.EOF; } } boolean isDoctype() { return type == TokenType.Doctype; } Doctype asDoctype() { return (Doctype) this; } boolean isStartTag() { return type == TokenType.StartTag; } StartTag asStartTag() { return (StartTag) this; } boolean isEndTag() { return type == TokenType.EndTag; } EndTag asEndTag() { return (EndTag) this; } boolean isComment() { return type == TokenType.Comment; } Comment asComment()

Jsoup, 14

<FILEB>
<CHANGES>
/**
 * Returns the tag name held by {@code lastStartTag}.
 * Callers use it to build the {@code "</name"} marker they look for in the remaining input and
 * to create the matching end tag token.
 * NOTE(review): presumably {@code lastStartTag} is the most recently emitted start tag - confirm.
 *
 * @return {@code lastStartTag.tagName}
 * @throws NullPointerException if {@code lastStartTag} is {@code null}
 */
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> { return (Comment) this; } boolean isCharacter() { return type == TokenType.Character; } Character asCharacter() { return (Character) this; } boolean isEOF() { return type == TokenType.EOF; } enum TokenType { Doctype, StartTag, EndTag, Comment, Character, EOF } }

Jsoup, 14

<FILEB>
<CHANGES>
/**
 * Returns the tag name held by {@code lastStartTag}.
 * Callers use it to build the {@code "</name"} marker they look for in the remaining input and
 * to create the matching end tag token.
 * NOTE(review): presumably {@code lastStartTag} is the most recently emitted start tag - confirm.
 *
 * @return {@code lastStartTag.tagName}
 * @throws NullPointerException if {@code lastStartTag} is {@code null}
 */
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.select; import org.jsoup.helper.Validate; import org.jsoup.nodes.Element; import java.util.*; /** A list of {@link Element Elements}, with methods that act on every element in the list @author Jonathan Hedley, jonathan@hedley.net */ public class Elements implements List<Element>, Cloneable { private List<Element> 
contents; public Elements() { contents = new ArrayList<Element>(); } public Elements(Collection<Element> elements) { contents = new ArrayList<Element>(elements); } public Elements(List<Element> elements) { contents = elements; } public Elements(Element... elements) { this(Arrays.asList(elements)); } @Override public Elements clone() { List<Element> elements = new ArrayList<Element>(); for(Element e : contents) elements.add(e.clone()); return new Elements(elements); } // attribute methods /** Get an attribute value from the first matched element that has the attribute. @param attributeKey The attribute key. @return The attribute value from the first matched element that has the attribute.. If no elements were matched (isEmpty() == true), or if the no elements have the attribute, returns empty string. @see #hasAttr(String) */ public String attr(String attributeKey) { for (Element element : contents) { if (element.hasAttr(attributeKey)) return element.attr(attributeKey); } return ""; } /** Checks if any of the matched elements have this attribute set. @param attributeKey attribute key @return true if any of the elements have the attribute; false if none do. */ public boolean hasAttr(String attributeKey) { for (Element element : contents) { if (element.hasAttr(attributeKey)) return true; } return false; } /** * Set an attribute on all matched elements. * @param attributeKey attribute key * @param attributeValue attribute value * @return this */ public Elements attr(String attributeKey, String attributeValue) { for (Element element : contents) { element.attr(attributeKey, attributeValue); } return this; } /** * Remove an attribute from every matched element. * @param attribute

Jsoup, 14

<FILEB>
<CHANGES>
/**
 * Returns the tag name held by {@code lastStartTag}.
 * Callers use it to build the {@code "</name"} marker they look for in the remaining input and
 * to create the matching end tag token.
 * NOTE(review): presumably {@code lastStartTag} is the most recently emitted start tag - confirm.
 *
 * @return {@code lastStartTag.tagName}
 * @throws NullPointerException if {@code lastStartTag} is {@code null}
 */
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>Key The attribute to remove. * @return this (for chaining) */ public Elements removeAttr(String attributeKey) { for (Element element : contents) { element.removeAttr(attributeKey); } return this; } /** Add the class name to every matched element's {@code class} attribute. 
@param className class name to add @return this */ public Elements addClass(String className) { for (Element element : contents) { element.addClass(className); } return this; } /** Remove the class name from every matched element's {@code class} attribute, if present. @param className class name to remove @return this */ public Elements removeClass(String className) { for (Element element : contents) { element.removeClass(className); } return this; } /** Toggle the class name on every matched element's {@code class} attribute. @param className class name to add if missing, or remove if present, from every element. @return this */ public Elements toggleClass(String className) { for (Element element : contents) { element.toggleClass(className); } return this; } /** Determine if any of the matched elements have this class name set in their {@code class} attribute. @param className class name to check for @return true if any do, false if none do */ public boolean hasClass(String className) { for (Element element : contents) { if (element.hasClass(className)) return true; } return false; } /** * Get the form element's value of the first matched element. * @return The form element's value, or empty if not set. * @see Element#val() */ public String val() { if (size() > 0) return first().val(); else return ""; } /** * Set the form element's value in each of the matched elements. * @param value The value to set into each matched element * @return this (for chaining) */ public Elements val(String value) { for (Element element : contents) element.val(value); return this; } /** * Get the combined text of all the matched elements. * <p> * Note that it is possible to get repeats if the matched elements contain both parent elements and

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> their own * children, as the Element.text() method returns the combined text of a parent and all its children. * @return string of all text: unescaped and no HTML. 
* @see Element#text() */ public String text() { StringBuilder sb = new StringBuilder(); for (Element element : contents) { if (sb.length() != 0) sb.append(" "); sb.append(element.text()); } return sb.toString(); } public boolean hasText() { for (Element element: contents) { if (element.hasText()) return true; } return false; } /** * Get the combined inner HTML of all matched elements. * @return string of all element's inner HTML. * @see #text() * @see #outerHtml() */ public String html() { StringBuilder sb = new StringBuilder(); for (Element element : contents) { if (sb.length() != 0) sb.append("\n"); sb.append(element.html()); } return sb.toString(); } /** * Get the combined outer HTML of all matched elements. * @return string of all element's outer HTML. * @see #text() * @see #html() */ public String outerHtml() { StringBuilder sb = new StringBuilder(); for (Element element : contents) { if (sb.length() != 0) sb.append("\n"); sb.append(element.outerHtml()); } return sb.toString(); } /** * Get the combined outer HTML of all matched elements. Alias of {@link #outerHtml()}. * @return string of all element's outer HTML. * @see #text() * @see #html() */ public String toString() { return outerHtml(); } /** * Update the tag name of each matched element. For example, to change each {@code <i>} to a {@code <em>}, do * {@code doc.select("i").tagName("em");} * @param tagName the new tag name * @return this, for chaining * @see Element#tagName(String) */ public Elements tagName(String tagName) { for (Element element : contents) { element.tagName(tagName); } return this; } /** * Set the inner HTML of each

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> matched element. * @param html HTML to parse and set into each matched element. * @return this, for chaining * @see Element#html(String) */ public Elements html(String html) { for (Element element : contents) { element.html(html); } return this; } /** * Add the supplied HTML to the start of each matched element's inner HTML. 
* @param html HTML to add inside each element, before the existing HTML * @return this, for chaining * @see Element#prepend(String) */ public Elements prepend(String html) { for (Element element : contents) { element.prepend(html); } return this; } /** * Add the supplied HTML to the end of each matched element's inner HTML. * @param html HTML to add inside each element, after the existing HTML * @return this, for chaining * @see Element#append(String) */ public Elements append(String html) { for (Element element : contents) { element.append(html); } return this; } /** * Insert the supplied HTML before each matched element's outer HTML. * @param html HTML to insert before each element * @return this, for chaining * @see Element#before(String) */ public Elements before(String html) { for (Element element : contents) { element.before(html); } return this; } /** * Insert the supplied HTML after each matched element's outer HTML. * @param html HTML to insert after each element * @return this, for chaining * @see Element#after(String) */ public Elements after(String html) { for (Element element : contents) { element.after(html); } return this; } /** Wrap the supplied HTML around each matched elements. For example, with HTML {@code <p><b>This</b> is <b>Jsoup</b></p>}, <code>doc.select("b").wrap("&lt;i&gt;&lt;/i&gt;");</code> becomes {@code <p><i><b>This</b></i> is <i><b>jsoup</b></i></p>} @param html HTML to wrap around each element, e.g. {@code <div class="head"></div>

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>}. Can be arbitrarily deep. @return this (for chaining) @see Element#wrap */ public Elements wrap(String html) { Validate.notEmpty(html); for (Element element : contents) { element.wrap(html); } return this; } /** * Empty (remove all child nodes from) each matched element. 
This is similar to setting the inner HTML of each * element to nothing. * <p> * E.g. HTML: {@code <div><p>Hello <b>there</b></p> <p>now</p></div>}<br> * <code>doc.select("p").empty();</code><br> * HTML = {@code <div><p></p> <p></p></div>} * @return this, for chaining * @see Element#empty() * @see #remove() */ public Elements empty() { for (Element element : contents) { element.empty(); } return this; } /** * Remove each matched element from the DOM. This is similar to setting the outer HTML of each element to nothing. * <p> * E.g. HTML: {@code <div><p>Hello</p> <p>there</p> <img /></div>}<br> * <code>doc.select("p").remove();</code><br> * HTML = {@code <div> <img /></div>} * <p> * Note that this method should not be used to clean user-submitted HTML; rather, use {@link org.jsoup.safety.Cleaner} to clean HTML. * @return this, for chaining * @see Element#empty() * @see #empty() */ public Elements remove() { for (Element element : contents) { element.remove(); } return this; } // filters /** * Find matching elements within this element list. * @param query A {@link Selector} query * @return the filtered list of elements, or an empty list if none match. */ public Elements select(String query) { return Selector.select(query, this); } /** * Remove elements from this list that do not match the {@link Selector} query. * <p> * E.g. HTML: {@code <div class=logo>One</div> <

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>div>Two</div>}<br> * <code>Elements divs = doc.select("div").not("#logo");</code><br> * Result: {@code divs: [<div>Two</div>]} * <p> * @param query the selector query whose results should be removed from these elements * @return a new elements list that contains only the filtered results */ public Elements not(String query) { 
Elements out = Selector.select(query, this); return Selector.filterOut(this, out); } /** * Get the <i>nth</i> matched element as an Elements object. * <p> * See also {@link #get(int)} to retrieve an Element. * @param index the (zero-based) index of the element in the list to retain * @return Elements containing only the specified element, or, if that element did not exist, an empty list. */ public Elements eq(int index) { return contents.size() > index ? new Elements(get(index)) : new Elements(); } /** * Test if any of the matched elements match the supplied query. * @param query A selector * @return true if at least one element in the list matches the query. */ public boolean is(String query) { Elements children = select(query); return !children.isEmpty(); } /** * Get all of the parents and ancestor elements of the matched elements. * @return */ public Elements parents() { HashSet<Element> combo = new LinkedHashSet<Element>(); for (Element e: contents) { combo.addAll(e.parents()); } return new Elements(combo); } // list-like methods /** Get the first matched element. @return The first matched element, or <code>null</code> if contents is empty; */ public Element first() { return contents.isEmpty() ? null : contents.get(0); } /** Get the last matched element. @return The last matched element, or <code>null</code> if contents is empty. */ public Element last() { return contents.isEmpty() ? null : contents.get(contents.size() - 1); } // implements List<Element> delegates: public int size() {return contents.size();} public boolean isEmpty() {return contents.isEmpty();}

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.nodes; import org.jsoup.helper.Validate; import java.util.*; /** * The attributes of an Element. * <p/> * Attributes are treated as a map: there can be only one value associated with an attribute key. 
* <p/> * Attribute key and value comparisons are done case insensitively, and keys are normalised to * lower-case. * * @author Jonathan Hedley, jonathan@hedley.net */ public class Attributes implements Iterable<Attribute>, Cloneable { protected static final String dataPrefix = "data-"; private LinkedHashMap<String, Attribute> attributes = null; // linked hash map to preserve insertion order. // null be default as so many elements have no attributes -- saves a good chunk of memory /** Get an attribute value by key. @param key the attribute key @return the attribute value if set; or empty string if not set. @see #hasKey(String) */ public String get(String key) { Validate.notEmpty(key); if (attributes == null) return ""; Attribute attr = attributes.get(key.toLowerCase()); return attr != null ? attr.getValue() : ""; } /** Set a new attribute, or replace an existing one by key. @param key attribute key @param value attribute value */ public void put(String key, String value) { Attribute attr = new Attribute(key, value); put(attr); } /** Set a new attribute, or replace an existing one by key. @param attribute attribute */ public void put(Attribute attribute) { Validate.notNull(attribute); if (attributes == null) attributes = new LinkedHashMap<String, Attribute>(2); attributes.put(attribute.getKey(), attribute); } /** Remove an attribute by key. @param key attribute key to remove */ public void remove(String key) { Validate.notEmpty(key); if (attributes == null) return; attributes.remove(key.toLowerCase()); } /** Tests if these attributes contain an attribute with this key. @param key key to check for @return true if key exists, false otherwise */ public boolean hasKey(String key) { return attributes != null && attributes.containsKey(key.toLowerCase()); } /** Get the number of attributes

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> in this set. @return size */ public int size() { if (attributes == null) return 0; return attributes.size(); } /** Add all the attributes from the incoming set to this set. @param incoming attributes to add to these attributes. 
*/ public void addAll(Attributes incoming) { if (incoming.size() == 0) return; if (attributes == null) attributes = new LinkedHashMap<String, Attribute>(incoming.size()); attributes.putAll(incoming.attributes); } public Iterator<Attribute> iterator() { return asList().iterator(); } /** Get the attributes as a List, for iteration. Do not modify the keys of the attributes via this view, as changes to keys will not be recognised in the containing set. @return an view of the attributes as a List. */ public List<Attribute> asList() { if (attributes == null) return Collections.emptyList(); List<Attribute> list = new ArrayList<Attribute>(attributes.size()); for (Map.Entry<String, Attribute> entry : attributes.entrySet()) { list.add(entry.getValue()); } return Collections.unmodifiableList(list); } /** * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys * starting with {@code data-}. * @return map of custom data attributes. */ public Map<String, String> dataset() { return new Dataset(); } /** Get the HTML representation of these attributes. @return HTML */ public String html() { StringBuilder accum = new StringBuilder(); html(accum, (new Document("")).outputSettings()); // output settings a bit funky, but this html() seldom used return accum.toString(); } void html(StringBuilder accum, Document.OutputSettings out) { if (attributes == null) return; for (Map.Entry<String, Attribute> entry : attributes.entrySet()) { Attribute attribute = entry.getValue(); accum.append(" "); attribute.html(accum, out); } } public String toString() { return html(); } @Override public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof Attributes)) return false; Attributes that = (Attributes) o; if (attributes != null ? !

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>attributes.equals(that.attributes) : that.attributes != null) return false; return true; } @Override public int hashCode() { return attributes != null ? 
attributes.hashCode() : 0; } @Override public Attributes clone() { if (attributes == null) return new Attributes(); Attributes clone; try { clone = (Attributes) super.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } clone.attributes = new LinkedHashMap<String, Attribute>(attributes.size()); for (Attribute attribute: this) clone.attributes.put(attribute.getKey(), attribute.clone()); return clone; } private class Dataset extends AbstractMap<String, String> { private Dataset() { if (attributes == null) attributes = new LinkedHashMap<String, Attribute>(2); } public Set<Entry<String, String>> entrySet() { return new EntrySet(); } @Override public String put(String key, String value) { String dataKey = dataKey(key); String oldValue = hasKey(dataKey) ? attributes.get(dataKey).getValue() : null; Attribute attr = new Attribute(dataKey, value); attributes.put(dataKey, attr); return oldValue; } private class EntrySet extends AbstractSet<Map.Entry<String, String>> { public Iterator<Map.Entry<String, String>> iterator() { return new DatasetIterator(); } public int size() { int count = 0; Iterator iter = new DatasetIterator(); while (iter.hasNext()) count++; return count; } } private class DatasetIterator implements Iterator<Map.Entry<String, String>> { private Iterator<Attribute> attrIter = attributes.values().iterator(); private Attribute attr; public boolean hasNext() { while (attrIter.hasNext()) { attr = attrIter.next(); if (attr.isDataAttribute()) return true; } return false; } public Entry<String, String> next() { return new Attribute(attr.getKey().substring(dataPrefix.length()), attr.getValue()); } public void remove() { attributes.remove(attr.getKey()); } } } private static String dataKey(String key) { return dataPrefix + key; } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.nodes; import org.jsoup.helper.Validate; import org.jsoup.parser.Tag; import org.jsoup.select.Elements; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.util.ArrayList; import java.util.List; /** A HTML Document. 
@author Jonathan Hedley, jonathan@hedley.net */ public class Document extends Element { private OutputSettings outputSettings = new OutputSettings(); private QuirksMode quirksMode = QuirksMode.noQuirks; /** Create a new, empty Document. @param baseUri base URI of document @see org.jsoup.Jsoup#parse @see #createShell */ public Document(String baseUri) { super(Tag.valueOf("#root"), baseUri); } /** Create a valid, empty shell of a document, suitable for adding more elements to. @param baseUri baseUri of document @return document with html, head, and body elements. */ static public Document createShell(String baseUri) { Validate.notNull(baseUri); Document doc = new Document(baseUri); Element html = doc.appendElement("html"); html.appendElement("head"); html.appendElement("body"); return doc; } /** Accessor to the document's {@code head} element. @return {@code head} */ public Element head() { return findFirstElementByTagName("head", this); } /** Accessor to the document's {@code body} element. @return {@code body} */ public Element body() { return findFirstElementByTagName("body", this); } /** Get the string contents of the document's {@code title} element. @return Trimed title, or empty string if none set. */ public String title() { Element titleEl = getElementsByTag("title").first(); return titleEl != null ? titleEl.text().trim() : ""; } /** Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if not present @param title string to set as title */ public void title(String title) { Validate.notNull(title); Element

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> titleEl = getElementsByTag("title").first(); if (titleEl == null) { // add to head head().appendElement("title").text(title); } else { titleEl.text(title); } } /** Create a new Element, with this document's base uri. Does not make the new element a child of this document. @param tagName element tag name (e.g. 
{@code a}) @return new element */ public Element createElement(String tagName) { return new Element(Tag.valueOf(tagName), this.baseUri()); } /** Normalise the document. This happens after the parse phase so generally does not need to be called. Moves any text content that is not in the body element into the body. @return this document after normalisation */ public Document normalise() { Element htmlEl = findFirstElementByTagName("html", this); if (htmlEl == null) htmlEl = appendElement("html"); if (head() == null) htmlEl.prependElement("head"); if (body() == null) htmlEl.appendElement("body"); // pull text nodes out of root, html, and head els, and push into body. non-text nodes are already taken care // of. do in inverse order to maintain text order. normaliseTextNodes(head()); normaliseTextNodes(htmlEl); normaliseTextNodes(this); normaliseStructure("head", htmlEl); normaliseStructure("body", htmlEl); return this; } // does not recurse. private void normaliseTextNodes(Element element) { List<Node> toMove = new ArrayList<Node>(); for (Node node: element.childNodes) { if (node instanceof TextNode) { TextNode tn = (TextNode) node; if (!tn.isBlank()) toMove.add(tn); } } for (int i = toMove.size()-1; i >= 0; i--) { Node node = toMove.get(i); element.removeChild(node); body().prependChild(new TextNode(" ", "")); body().prependChild(node); } } // merge multiple <head> or <body> contents into one, delete the remainder, and ensure they are owned by <html> private void normaliseStructure(String tag, Element htmlEl) {

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> Elements elements = this.getElementsByTag(tag); Element master = elements.first(); // will always be available as created above if not existent if (elements.size() > 1) { // dupes, move contents to master List<Node> toMove = new ArrayList<Node>(); for (int i = 1; i < elements.size(); i++) { Node dupe = elements.get(i); for (Node 
node : dupe.childNodes) toMove.add(node); dupe.remove(); } for (Node dupe : toMove) master.appendChild(dupe); } // ensure parented by <html> if (!master.parent().equals(htmlEl)) { htmlEl.appendChild(master); // includes remove() } } // fast method to get first by tag name, used for html, head, body finders private Element findFirstElementByTagName(String tag, Node node) { if (node.nodeName().equals(tag)) return (Element) node; else { for (Node child: node.childNodes) { Element found = findFirstElementByTagName(tag, child); if (found != null) return found; } } return null; } @Override public String outerHtml() { return super.html(); // no outer wrapper tag } /** Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared. @param text unencoded text @return this document */ @Override public Element text(String text) { body().text(text); // overridden to not nuke doc structure return this; } @Override public String nodeName() { return "#document"; } @Override public Document clone() { Document clone = (Document) super.clone(); clone.outputSettings = this.outputSettings.clone(); return clone; } /** * A Document's output settings control the form of the text() and html() methods. */ public static class OutputSettings implements Cloneable { private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; private Charset charset = Charset.forName("UTF-8"); private CharsetEncoder charsetEncoder = charset.newEncoder(); private boolean prettyPrint = true; private int indentAmount = 1; public OutputSettings() {} /**

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>, * which uses the complete set of HTML named entities. * <p> * The default escape mode is <code>base</code>. 
* @return the document's current escape mode */ public Entities.EscapeMode escapeMode() { return escapeMode; } /** * Set the document's escape mode * @param escapeMode the new escape mode to use * @return the document's output settings, for chaining */ public OutputSettings escapeMode(Entities.EscapeMode escapeMode) { this.escapeMode = escapeMode; return this; } /** * Get the document's current output charset, which is used to control which characters are escaped when * generating HTML (via the <code>html()</code> methods), and which are kept intact. * <p> * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the * input charset. Otherwise, it defaults to UTF-8. * @return the document's current charset. */ public Charset charset() { return charset; } /** * Update the document's output charset. * @param charset the new charset to use. * @return the document's output settings, for chaining */ public OutputSettings charset(Charset charset) { // todo: this should probably update the doc's meta charset this.charset = charset; charsetEncoder = charset.newEncoder(); return this; } /** * Update the document's output charset. * @param charset the new charset (by name) to use. * @return the document's output settings, for chaining */ public OutputSettings charset(String charset) { charset(Charset.forName(charset)); return this; } CharsetEncoder encoder() { return charsetEncoder; } /** * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format * the output, and the output will generally look like the input. * @return if pretty printing is enabled. */ public boolean prettyPrint() { return prettyPrint; } /** * Enable or disable pretty printing. * @param pretty new pretty print

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.nodes; /** A comment node. @author Jonathan Hedley, jonathan@hedley.net */ public class Comment extends Node { private static final String COMMENT_KEY = "comment"; /** Create a new comment node. 
@param data The contents of the comment @param baseUri base URI */ public Comment(String data, String baseUri) { super(baseUri); attributes.put(COMMENT_KEY, data); } public String nodeName() { return "#comment"; } /** Get the contents of the comment. @return comment content */ public String getData() { return attributes.get(COMMENT_KEY); } void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { if (out.prettyPrint()) indent(accum, depth, out); accum .append("<!--") .append(getData()) .append("-->"); } void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {} public String toString() { return outerHtml(); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.nodes; import org.jsoup.helper.StringUtil; import org.jsoup.helper.Validate; import org.jsoup.parser.Parser; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import 
java.util.List; /** The base, abstract Node model. Elements, Documents, Comments etc are all Node instances. @author Jonathan Hedley, jonathan@hedley.net */ public abstract class Node implements Cloneable { Node parentNode; List<Node> childNodes; Attributes attributes; String baseUri; int siblingIndex; /** Create a new Node. @param baseUri base URI @param attributes attributes (not null, but may be empty) */ protected Node(String baseUri, Attributes attributes) { Validate.notNull(baseUri); Validate.notNull(attributes); childNodes = new ArrayList<Node>(4); this.baseUri = baseUri.trim(); this.attributes = attributes; } protected Node(String baseUri) { this(baseUri, new Attributes()); } /** * Default constructor. Doesn't setup base uri, children, or attributes; use with caution. */ protected Node() { childNodes = Collections.emptyList(); attributes = null; } /** Get the node name of this node. Use for debugging purposes and not logic switching (for that, use instanceof). @return node name */ public abstract String nodeName(); /** * Get an attribute's value by its key. * <p/> * To get an absolute URL from an attribute that may be a relative URL, prefix the key with <code><b>abs</b></code>, * which is a shortcut to the {@link #absUrl} method. * E.g.: <blockquote><code>String url = a.attr("abs:href");</code></blockquote> * @param attributeKey The attribute key. * @return The attribute, or empty string if not present (to avoid nulls). * @see #attributes() * @see #hasAttr(String) * @see #absUrl(String) */ public String attr

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>(String attributeKey) { Validate.notNull(attributeKey); if (attributes.hasKey(attributeKey)) return attributes.get(attributeKey); else if (attributeKey.toLowerCase().startsWith("abs:")) return absUrl(attributeKey.substring("abs:".length())); else return ""; } /** * Get all of the element's attributes. 
* @return attributes (which implements iterable, in same order as presented in original HTML). */ public Attributes attributes() { return attributes; } /** * Set an attribute (key=value). If the attribute already exists, it is replaced. * @param attributeKey The attribute key. * @param attributeValue The attribute value. * @return this (for chaining) */ public Node attr(String attributeKey, String attributeValue) { attributes.put(attributeKey, attributeValue); return this; } /** * Test if this element has an attribute. * @param attributeKey The attribute key to check. * @return true if the attribute exists, false if not. */ public boolean hasAttr(String attributeKey) { Validate.notNull(attributeKey); if (attributeKey.toLowerCase().startsWith("abs:")) { String key = attributeKey.substring("abs:".length()); if (attributes.hasKey(key) && !absUrl(key).equals("")) return true; } return attributes.hasKey(attributeKey); } /** * Remove an attribute from this element. * @param attributeKey The attribute to remove. * @return this (for chaining) */ public Node removeAttr(String attributeKey) { Validate.notNull(attributeKey); attributes.remove(attributeKey); return this; } /** Get the base URI of this node. @return base URI */ public String baseUri() { return baseUri; } /** Update the base URI of this node. @param baseUri base URI to set */ public void setBaseUri(String baseUri) { Validate.notNull(baseUri); this.baseUri = baseUri; } /** * Get an absolute URL from a URL attribute that may be relative (i.e. an <code>&lt;a href></code> or * <code>&lt;img src></code>). * <p/> * E.g.: <code>

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>String absUrl = linkEl.absUrl("href");</code> * <p/> * If the attribute value is already absolute (i.e. it starts with a protocol, like * <code>http://</code> or <code>https://</code> etc), and it successfully parses as a URL, the attribute is * returned directly. 
Otherwise, it is treated as a URL relative to the element's {@link #baseUri}, and made * absolute using that. * <p/> * As an alternate, you can use the {@link #attr} method with the <code>abs:</code> prefix, e.g.: * <code>String absUrl = linkEl.attr("abs:href");</code> * * @param attributeKey The attribute key * @return An absolute URL if one could be made, or an empty string (not null) if the attribute was missing or * could not be made successfully into a URL. * @see #attr * @see java.net.URL#URL(java.net.URL, String) */ public String absUrl(String attributeKey) { Validate.notEmpty(attributeKey); String relUrl = attr(attributeKey); if (!hasAttr(attributeKey)) { return ""; // nothing to make absolute with } else { URL base; try { try { base = new URL(baseUri); } catch (MalformedURLException e) { // the base is unsuitable, but the attribute may be abs on its own, so try that URL abs = new URL(relUrl); return abs.toExternalForm(); } // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired if (relUrl.startsWith("?")) relUrl = base.getPath() + relUrl; URL abs = new URL(base, relUrl); return abs.toExternalForm(); } catch (MalformedURLException e) { return ""; } } } /** Get a child node by index @param index index of child node @return the child node at this index. */ public Node childNode(int index) { return childNodes.get(index); } /** Get this node's children. Presented as an unmodifiable list: new children can not be added, but the child nodes themselves

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> can be manipulated. @return list of children. If no children, returns an empty list. */ public List<Node> childNodes() { return Collections.unmodifiableList(childNodes); } protected Node[] childNodesAsArray() { return childNodes.toArray(new Node[childNodes().size()]); } /** Gets this node's parent node. 
@return parent node; or null if no parent. */ public Node parent() { return parentNode; } /** * Gets the Document associated with this Node. * @return the Document associated with this Node, or null if there is no such Document. */ public Document ownerDocument() { if (this instanceof Document) return (Document) this; else if (parentNode == null) return null; else return parentNode.ownerDocument(); } /** * Remove (delete) this node from the DOM tree. If this node has children, they are also removed. */ public void remove() { Validate.notNull(parentNode); parentNode.removeChild(this); } /** * Insert the specified HTML into the DOM before this node (i.e. as a preceeding sibling). * @param html HTML to add before this node * @return this node, for chaining * @see #after(String) */ public Node before(String html) { addSiblingHtml(siblingIndex(), html); return this; } /** * Insert the specified node into the DOM before this node (i.e. as a preceeding sibling). * @param node to add before this node * @return this node, for chaining * @see #after(Node) */ public Node before(Node node) { Validate.notNull(node); Validate.notNull(parentNode); parentNode.addChildren(siblingIndex(), node); return this; } /** * Insert the specified HTML into the DOM after this node (i.e. as a following sibling). * @param html HTML to add after this node * @return this node, for chaining * @see #before(String) */ public Node after(String html) { addSiblingHtml(siblingIndex()+1, html); return this; } /** * Insert the specified node into the DOM after this node (i.e. as a following sibling). * @param node to add after this node * @return

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> this node, for chaining * @see #before(Node) */ public Node after(Node node) { Validate.notNull(node); Validate.notNull(parentNode); parentNode.addChildren(siblingIndex()+1, node); return this; } private void addSiblingHtml(int index, String html) { Validate.notNull(html); Validate.notNull(parentNode); Element context = parent() 
instanceof Element ? (Element) parent() : null; List<Node> nodes = Parser.parseFragment(html, context, baseUri()); parentNode.addChildren(index, nodes.toArray(new Node[nodes.size()])); } /** Wrap the supplied HTML around this node. @param html HTML to wrap around this element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep. @return this node, for chaining. */ public Node wrap(String html) { Validate.notEmpty(html); Element context = parent() instanceof Element ? (Element) parent() : null; List<Node> wrapChildren = Parser.parseFragment(html, context, baseUri()); Node wrapNode = wrapChildren.get(0); if (wrapNode == null || !(wrapNode instanceof Element)) // nothing to wrap with; noop return null; Element wrap = (Element) wrapNode; Element deepest = getDeepChild(wrap); parentNode.replaceChild(this, wrap); deepest.addChildren(this); // remainder (unbalanced wrap, like <div></div><p></p> -- The <p> is remainder if (wrapChildren.size() > 0) { for (int i = 0; i < wrapChildren.size(); i++) { Node remainder = wrapChildren.get(i); remainder.parentNode.removeChild(remainder); wrap.appendChild(remainder); } } return this; } private Element getDeepChild(Element el) { List<Element> children = el.children(); if (children.size() > 0) return getDeepChild(children.get(0)); else return el; } /** * Replace this node in the DOM with the supplied node. * @param in the node that will will replace the existing node. */ public void replaceWith(Node in) { Validate.notNull(in);

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> this is the last sibling */ public Node nextSibling() { if (parentNode == null) return null; // root List<Node> siblings = parentNode.childNodes; Integer index = siblingIndex(); Validate.notNull(index); if (siblings.size() > index+1) return siblings.get(index+1); else return null; } /** Get this node's previous sibling. 
@return the previous sibling, or null if this is the first sibling */ public Node previousSibling() { List<Node> siblings = parentNode.childNodes; Integer index = siblingIndex(); Validate.notNull(index); if (index > 0) return siblings.get(index-1); else return null; } /** * Get the list index of this node in its node sibling list. I.e. if this is the first node * sibling, returns 0. * @return position in node sibling list * @see org.jsoup.nodes.Element#elementSiblingIndex() */ public int siblingIndex() { return siblingIndex; } protected void setSiblingIndex(int siblingIndex) { this.siblingIndex = siblingIndex; } /** Get the outer HTML of this node. @return HTML */ public String outerHtml() { StringBuilder accum = new StringBuilder(32*1024); outerHtml(accum); return accum.toString(); } protected void outerHtml(StringBuilder accum) { new NodeTraversor(new OuterHtmlVisitor(accum, getOutputSettings())).traverse(this); } // if this node has no document (or parent), retrieve the default output settings private Document.OutputSettings getOutputSettings() { return ownerDocument() != null ? ownerDocument().outputSettings() : (new Document("")).outputSettings(); } /** Get the outer HTML of this node. @param accum accumulator to place HTML into */ abstract void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out); abstract void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out); public String toString() { return outerHtml(); } protected void indent(StringBuilder accum, int depth, Document.OutputSettings out) { accum.append("\n").append(StringUtil.padding(depth * out.indentAmount())); } @Override public

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.helper; /** * Simple validation methods. 
Designed for jsoup internal use */ public final class Validate { private Validate() {} /** * Validates that the obect is not null * @param obj object to test */ public static void notNull(Object obj) { if (obj == null) throw new IllegalArgumentException("Object must not be null"); } /** * Validates that the object is not null * @param obj object to test * @param msg message to output if validation fails */ public static void notNull(Object obj, String msg) { if (obj == null) throw new IllegalArgumentException(msg); } /** * Validates that the value is true * @param val object to test */ public static void isTrue(boolean val) { if (!val) throw new IllegalArgumentException("Must be true"); } /** * Validates that the value is true * @param val object to test * @param msg message to output if validation fails */ public static void isTrue(boolean val, String msg) { if (!val) throw new IllegalArgumentException(msg); } /** * Validates that the value is false * @param val object to test */ public static void isFalse(boolean val) { if (val) throw new IllegalArgumentException("Must be false"); } /** * Validates that the value is false * @param val object to test * @param msg message to output if validation fails */ public static void isFalse(boolean val, String msg) { if (val) throw new IllegalArgumentException(msg); } /** * Validates that the array contains no null elements * @param objects the array to test */ public static void noNullElements(Object[] objects) { noNullElements(objects, "Array must not contain any null objects"); } /** * Validates that the array contains no null elements * @param objects the array to test * @param msg message to output if validation fails */ public static void noNullElements(Object[] objects, String msg) { for (Object obj : objects) if (obj == null) throw new IllegalArgumentException(msg); } /** * Validates that the string is not empty * @param string the string to test */ public static void notEmpty(String string)

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> { if (string == null || string.length() == 0) throw new IllegalArgumentException("String must not be empty"); } /** * Validates that the string is not empty * @param string the string to test * @param msg message to output if validation fails */ public static void notEmpty(String string, String msg) { if (string == null || 
string.length() == 0) throw new IllegalArgumentException(msg); } /** Cause a failure. @param msg message to output. */ public static void fail(String msg) { throw new IllegalArgumentException(msg); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.select; import org.jsoup.helper.Validate; import org.jsoup.nodes.Element; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Evaluates that an element matches the selector. 
*/ public abstract class Evaluator { Evaluator() { } /** * Test if the element meets the evaluator's requirements. * * @param root Root of the matching subtree * @param element tested element */ public abstract boolean matches(Element root, Element element); /** * Evaluator for tag name */ public static final class Tag extends Evaluator { private String tagName; public Tag(String tagName) { this.tagName = tagName; } @Override public boolean matches(Element root, Element element) { return (element.tagName().equals(tagName)); } @Override public String toString() { return String.format("%s", tagName); } } /** * Evaluator for element id */ public static final class Id extends Evaluator { private String id; public Id(String id) { this.id = id; } @Override public boolean matches(Element root, Element element) { return (id.equals(element.id())); } @Override public String toString() { return String.format("#%s", id); } } /** * Evaluator for element class */ public static final class Class extends Evaluator { private String className; public Class(String className) { this.className = className; } @Override public boolean matches(Element root, Element element) { return (element.hasClass(className)); } @Override public String toString() { return String.format(".%s", className); } } /** * Evaluator for attibute name matching */ public static final class Attribute extends Evaluator { private String key; public Attribute(String key) { this.key = key; } @Override public boolean matches(Element root, Element element) { return element.hasAttr(key); } @Override public String toString() { return String.format("[%s]", key); } } /** * Evaluator for attribute name prefix matching */ public static final

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> class AttributeStarting extends Evaluator { private String keyPrefix; public AttributeStarting(String keyPrefix) { this.keyPrefix = keyPrefix; } @Override public boolean matches(Element root, Element element) { List<org.jsoup.nodes.Attribute> values = element.attributes().asList(); for (org.jsoup.nodes.Attribute attribute : 
values) { if (attribute.getKey().startsWith(keyPrefix)) return true; } return false; } @Override public String toString() { return String.format("[^%s]", keyPrefix); } } /** * Evaluator for attribute name/value matching */ public static final class AttributeWithValue extends AttributeKeyPair { public AttributeWithValue(String key, String value) { super(key, value); } @Override public boolean matches(Element root, Element element) { return element.hasAttr(key) && value.equalsIgnoreCase(element.attr(key)); } @Override public String toString() { return String.format("[%s=%s]", key, value); } } /** * Evaluator for attribute name != value matching */ public static final class AttributeWithValueNot extends AttributeKeyPair { public AttributeWithValueNot(String key, String value) { super(key, value); } @Override public boolean matches(Element root, Element element) { return !value.equalsIgnoreCase(element.attr(key)); } @Override public String toString() { return String.format("[%s!=%s]", key, value); } } /** * Evaluator for attribute name/value matching (value prefix) */ public static final class AttributeWithValueStarting extends AttributeKeyPair { public AttributeWithValueStarting(String key, String value) { super(key, value); } @Override public boolean matches(Element root, Element element) { return element.hasAttr(key) && element.attr(key).toLowerCase().startsWith(value); // value is lower case already } @Override public String toString() { return String.format("[%s^=%s]", key, value); } } /** * Evaluator for attribute name/value matching (value ending) */ public static final class AttributeWithValueEnding extends AttributeKeyPair { public AttributeWithValueEnding(String key, String value) { super(key, value); }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> @Override public boolean matches(Element root, Element element) { return element.hasAttr(key) && element.attr(key).toLowerCase().endsWith(value); // value is lower case } @Override public String toString() { return String.format("[%s$=%s]", key, value); } } /** * Evaluator for attribute name/value matching (value containing) */ 
public static final class AttributeWithValueContaining extends AttributeKeyPair { public AttributeWithValueContaining(String key, String value) { super(key, value); } @Override public boolean matches(Element root, Element element) { return element.hasAttr(key) && element.attr(key).toLowerCase().contains(value); // value is lower case } @Override public String toString() { return String.format("[%s*=%s]", key, value); } } /** * Evaluator for attribute name/value matching (value regex matching) */ public static final class AttributeWithValueMatching extends Evaluator { String key; Pattern pattern; public AttributeWithValueMatching(String key, Pattern pattern) { this.key = key.trim().toLowerCase(); this.pattern = pattern; } @Override public boolean matches(Element root, Element element) { return element.hasAttr(key) && pattern.matcher(element.attr(key)).find(); } @Override public String toString() { return String.format("[%s~=%s]", key, pattern.toString()); } } /** * Abstract evaluator for attribute name/value matching */ public abstract static class AttributeKeyPair extends Evaluator { String key; String value; public AttributeKeyPair(String key, String value) { Validate.notEmpty(key); Validate.notEmpty(value); this.key = key.trim().toLowerCase(); this.value = value.trim().toLowerCase(); } } /** * Evaluator for any / all element matching */ public static final class AllElements extends Evaluator { @Override public boolean matches(Element root, Element element) { return true; } @Override public String toString() { return "*"; } } /** * Evaluator for matching by sibling index number (e < idx) */ public static final class IndexLessThan extends IndexEvaluator { public IndexLessThan(int index) { super(index

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>); } @Override public boolean matches(Element root, Element element) { return element.elementSiblingIndex() < index; } @Override public String toString() { return String.format(":lt(%d)", index); } } /** * Evaluator for matching by sibling index number (e > idx) */ public static final class IndexGreaterThan extends IndexEvaluator 
{ public IndexGreaterThan(int index) { super(index); } @Override public boolean matches(Element root, Element element) { return element.elementSiblingIndex() > index; } @Override public String toString() { return String.format(":gt(%d)", index); } } /** * Evaluator for matching by sibling index number (e = idx) */ public static final class IndexEquals extends IndexEvaluator { public IndexEquals(int index) { super(index); } @Override public boolean matches(Element root, Element element) { return element.elementSiblingIndex() == index; } @Override public String toString() { return String.format(":eq(%d)", index); } } /** * Abstract evaluator for sibling index matching * * @author ant */ public abstract static class IndexEvaluator extends Evaluator { int index; public IndexEvaluator(int index) { this.index = index; } } /** * Evaluator for matching Element (and its descendents) text */ public static final class ContainsText extends Evaluator { private String searchText; public ContainsText(String searchText) { this.searchText = searchText.toLowerCase(); } @Override public boolean matches(Element root, Element element) { return (element.text().toLowerCase().contains(searchText)); } @Override public String toString() { return String.format(":contains(%s", searchText); } } /** * Evaluator for matching Element's own text */ public static final class ContainsOwnText extends Evaluator { private String searchText; public ContainsOwnText(String searchText) { this.searchText = searchText.toLowerCase(); } @Override public boolean matches(Element root, Element element) { return (element.ownText().toLowerCase().contains(searchText)); } @Override public String toString() { return String.format(":containsOwn(%s",

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> searchText); } } /** * Evaluator for matching Element (and its descendents) text with regex */ public static final class Matches extends Evaluator { private Pattern pattern; public Matches(Pattern pattern) { this.pattern = pattern; } @Override public boolean matches(Element root, Element element) { Matcher m = 
pattern.matcher(element.text()); return m.find(); } @Override public String toString() { return String.format(":matches(%s", pattern); } } /** * Evaluator for matching Element's own text with regex */ public static final class MatchesOwn extends Evaluator { private Pattern pattern; public MatchesOwn(Pattern pattern) { this.pattern = pattern; } @Override public boolean matches(Element root, Element element) { Matcher m = pattern.matcher(element.ownText()); return m.find(); } @Override public String toString() { return String.format(":matchesOwn(%s", pattern); } } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.select; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import org.jsoup.helper.StringUtil; import org.jsoup.helper.Validate; import org.jsoup.parser.TokenQueue; /** * Parses a CSS selector into an Evaluator tree. 
*/ class QueryParser { private final static String[] combinators = {",", ">", "+", "~", " "}; private TokenQueue tq; private String query; private List<Evaluator> evals = new ArrayList<Evaluator>(); /** * Create a new QueryParser. * @param query CSS query */ private QueryParser(String query) { this.query = query; this.tq = new TokenQueue(query); } /** * Parse a CSS query into an Evaluator. * @param query CSS query * @return Evaluator */ public static Evaluator parse(String query) { QueryParser p = new QueryParser(query); return p.parse(); } /** * Parse the query * @return Evaluator */ Evaluator parse() { tq.consumeWhitespace(); if (tq.matchesAny(combinators)) { // if starts with a combinator, use root as elements evals.add(new StructuralEvaluator.Root()); combinator(tq.consume()); } else { findElements(); } while (!tq.isEmpty()) { // hierarchy and extras boolean seenWhite = tq.consumeWhitespace(); if (tq.matchChomp(",")) { // group or CombiningEvaluator.Or or = new CombiningEvaluator.Or(evals); evals.clear(); evals.add(or); while (!tq.isEmpty()) { String subQuery = tq.chompTo(","); or.add(parse(subQuery)); } } else if (tq.matchesAny(combinators)) { combinator(tq.consume()); } else if (seenWhite) { combinator(' '); } else { // E.class, E#id, E[attr] etc. AND findElements(); // take next el, #. etc off queue } } if (evals.size() == 1) return evals.get(0); return new CombiningEvaluator.And(evals);

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> } private void combinator(char combinator) { tq.consumeWhitespace(); String subQuery = consumeSubQuery(); // support multi > childs Evaluator e; if (evals.size() == 1) e = evals.get(0); else e = new CombiningEvaluator.And(evals); evals.clear(); Evaluator f = parse(subQuery); if (combinator == '>') evals.add(new 
CombiningEvaluator.And(f, new StructuralEvaluator.ImmediateParent(e))); else if (combinator == ' ') evals.add(new CombiningEvaluator.And(f, new StructuralEvaluator.Parent(e))); else if (combinator == '+') evals.add(new CombiningEvaluator.And(f, new StructuralEvaluator.ImmediatePreviousSibling(e))); else if (combinator == '~') evals.add(new CombiningEvaluator.And(f, new StructuralEvaluator.PreviousSibling(e))); else throw new Selector.SelectorParseException("Unknown combinator: " + combinator); } private String consumeSubQuery() { StringBuilder sq = new StringBuilder(); while (!tq.isEmpty()) { if (tq.matches("(")) sq.append("(").append(tq.chompBalanced('(', ')')).append(")"); else if (tq.matches("[")) sq.append("[").append(tq.chompBalanced('[', ']')).append("]"); else if (tq.matchesAny(combinators)) break; else sq.append(tq.consume()); } return sq.toString(); } private void findElements() { if (tq.matchChomp("#")) byId(); else if (tq.matchChomp(".")) byClass(); else if (tq.matchesWord()) byTag(); else if (tq.matches("[")) byAttribute(); else if (tq.matchChomp("*")) allElements(); else if (tq.matchChomp(":lt(")) indexLessThan(); else if (tq.matchChomp(":gt(")) indexGreaterThan(); else if (tq.matchChomp(":eq(")) indexEquals(); else if (tq.matches(":has(")) has(); else if (tq.matches(":contains(")) contains(false); else if (tq.matches(":containsOwn

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>(")) contains(true); else if (tq.matches(":matches(")) matches(false); else if (tq.matches(":matchesOwn(")) matches(true); else if (tq.matches(":not(")) not(); else // unhandled throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); } private void byId() { String 
id = tq.consumeCssIdentifier(); Validate.notEmpty(id); evals.add(new Evaluator.Id(id)); } private void byClass() { String className = tq.consumeCssIdentifier(); Validate.notEmpty(className); evals.add(new Evaluator.Class(className.trim().toLowerCase())); } private void byTag() { String tagName = tq.consumeElementSelector(); Validate.notEmpty(tagName); // namespaces: if element name is "abc:def", selector must be "abc|def", so flip: if (tagName.contains("|")) tagName = tagName.replace("|", ":"); evals.add(new Evaluator.Tag(tagName.trim().toLowerCase())); } private void byAttribute() { TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue String key = cq.consumeToAny("=", "!=", "^=", "$=", "*=", "~="); // eq, not, start, end, contain, match, (no val) Validate.notEmpty(key); cq.consumeWhitespace(); if (cq.isEmpty()) { if (key.startsWith("^")) evals.add(new Evaluator.AttributeStarting(key.substring(1))); else evals.add(new Evaluator.Attribute(key)); } else { if (cq.matchChomp("=")) evals.add(new Evaluator.AttributeWithValue(key, cq.remainder())); else if (cq.matchChomp("!=")) evals.add(new Evaluator.AttributeWithValueNot(key, cq.remainder())); else if (cq.matchChomp("^=")) evals.add(new Evaluator.AttributeWithValueStarting(key, cq.remainder())); else if (cq.matchChomp("$=")) evals.add(new Evaluator.AttributeWithValueEnding(key, cq.remainder())); else if (cq

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>.matchChomp("*=")) evals.add(new Evaluator.AttributeWithValueContaining(key, cq.remainder())); else if (cq.matchChomp("~=")) evals.add(new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()))); else throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", 
query, cq.remainder()); } } private void allElements() { evals.add(new Evaluator.AllElements()); } // pseudo selectors :lt, :gt, :eq private void indexLessThan() { evals.add(new Evaluator.IndexLessThan(consumeIndex())); } private void indexGreaterThan() { evals.add(new Evaluator.IndexGreaterThan(consumeIndex())); } private void indexEquals() { evals.add(new Evaluator.IndexEquals(consumeIndex())); } private int consumeIndex() { String indexS = tq.chompTo(")").trim(); Validate.isTrue(StringUtil.isNumeric(indexS), "Index must be numeric"); return Integer.parseInt(indexS); } // pseudo selector :has(el) private void has() { tq.consume(":has"); String subQuery = tq.chompBalanced('(', ')'); Validate.notEmpty(subQuery, ":has(el) subselect must not be empty"); evals.add(new StructuralEvaluator.Has(parse(subQuery))); } // pseudo selector :contains(text), containsOwn(text) private void contains(boolean own) { tq.consume(own ? ":containsOwn" : ":contains"); String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')')); Validate.notEmpty(searchText, ":contains(text) query must not be empty"); if (own) evals.add(new Evaluator.ContainsOwnText(searchText)); else evals.add(new Evaluator.ContainsText(searchText)); } // :matches(regex), matchesOwn(regex) private void matches(boolean own) { tq.consume(own ? ":matchesOwn" : ":matches"); String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex bits will be escaped Validate.notEmpty(regex

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>, ":matches(regex) query must not be empty"); if (own) evals.add(new Evaluator.MatchesOwn(Pattern.compile(regex))); else evals.add(new Evaluator.Matches(Pattern.compile(regex))); } // :not(selector) private void not() { tq.consume(":not"); String subQuery = tq.chompBalanced('(', ')'); Validate.notEmpty(subQuery, ":not(selector) 
subselect must not be empty"); evals.add(new StructuralEvaluator.Not(parse(subQuery))); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>><td><code>:matches(<em>regex</em>)</code></td><td>elements whose text matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matches(\\d+)</code> finds table cells containing digits. 
<code>div:matches((?i)login)</code> finds divs containing the text, case insensitively.</td></tr> * <tr><td><code>:containsOwn(<em>text</em>)</code></td><td>elements that directly contains the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants.</td><td><code>p:containsOwn(jsoup)</code> finds p elements with own text "jsoup".</td></tr> * <tr><td><code>:matchesOwn(<em>regex</em>)</code></td><td>elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesOwn(\\d+)</code> finds table cells directly containing digits. <code>div:matchesOwn((?i)login)</code> finds divs containing the text, case insensitively.</td></tr> * <tr><td></td><td>The above may be combined in any order and with other selectors</td><td><code>.light:contains(name):eq(0)</code></td></tr> * </table> * * @author Jonathan Hedley, jonathan@hedley.net * @see Element#select(String) */ public class Selector { private final Evaluator evaluator; private final Element root; private Selector(String query, Element root) { Validate.notNull(query); query = query.trim(); Validate.notEmpty(query); Validate.notNull(root); this.evaluator = QueryParser.parse(query); this.root = root; } /** * Find elements matching selector. * * @param query CSS selector * @param root root element to descend into * @return matching elements, empty if not */ public static Elements select(String query, Element root) { return new Selector(query, root).select(); } /** * Find elements matching selector. * * @param query CSS

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> selector * @param roots root elements to descend into * @return matching elements, empty if not */ public static Elements select(String query, Iterable<Element> roots) { Validate.notEmpty(query); Validate.notNull(roots); LinkedHashSet<Element> elements = new LinkedHashSet<Element>(); for (Element root : roots) { 
elements.addAll(select(query, root)); } return new Elements(elements); } private Elements select() { return Collector.collect(evaluator, root); } // exclude set. package open so that Elements can implement .not() selector. static Elements filterOut(Collection<Element> elements, Collection<Element> outs) { Elements output = new Elements(); for (Element el : elements) { boolean found = false; for (Element out : outs) { if (el.equals(out)) { found = true; break; } } if (!found) output.add(el); } return output; } public static class SelectorParseException extends IllegalStateException { public SelectorParseException(String msg, Object... params) { super(String.format(msg, params)); } } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.nodes; import java.nio.charset.CharsetEncoder; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * HTML entities, and escape routines. 
* Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C HTML * named character references</a>. */ public class Entities { public enum EscapeMode { /** Restricted entities suitable for XHTML output: lt, gt, amp, apos, and quot only. */ xhtml(xhtmlByVal), /** Default HTML output entities. */ base(baseByVal), /** Complete HTML entities. */ extended(fullByVal); private Map<Character, String> map; EscapeMode(Map<Character, String> map) { this.map = map; } public Map<Character, String> getMap() { return map; } } private static final Map<String, Character> full; private static final Map<Character, String> xhtmlByVal; private static final Map<Character, String> baseByVal; private static final Map<Character, String> fullByVal; private static final Pattern unescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?"); private static final Pattern strictUnescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);"); private Entities() {} /** * Check if the input is a known named entity * @param name the possible entity name (e.g. "lt" or "amp" * @return true if a known named entity */ public static boolean isNamedEntity(String name) { return full.containsKey(name); } /** * Get the Character value of the named entity * @param name named entity (e.g. "lt" or "amp") * @return the Character value of the named entity (e.g. '<' or '&') */ public static Character getCharacterByName(String name) { return full.get(name); } static String escape(String string, Document.Output

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>Settings out) { return escape(string, out.encoder(), out.escapeMode()); } static String escape(String string, CharsetEncoder encoder, EscapeMode escapeMode) { StringBuilder accum = new StringBuilder(string.length() * 2); Map<Character, String> map = escapeMode.getMap(); for (int pos = 0; pos < string.length(); pos++) { Character c 
= string.charAt(pos); if (map.containsKey(c)) accum.append('&').append(map.get(c)).append(';'); else if (encoder.canEncode(c)) accum.append(c.charValue()); else accum.append("&#").append((int) c).append(';'); } return accum.toString(); } static String unescape(String string) { return unescape(string, false); } /** * Unescape the input string. * @param string * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) * @return */ static String unescape(String string, boolean strict) { // todo: change this method to use Tokeniser.consumeCharacterReference if (!string.contains("&")) return string; Matcher m = strict? strictUnescapePattern.matcher(string) : unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);? StringBuffer accum = new StringBuffer(string.length()); // pity matcher can't use stringbuilder, avoid syncs // todo: replace m.appendReplacement with own impl, so StringBuilder and quoteReplacement not required while (m.find()) { int charval = -1; String num = m.group(3); if (num != null) { try { int base = m.group(2) != null ? 16 : 10; // 2 is hex indicator charval = Integer.valueOf(num, base); } catch (NumberFormatException e) { } // skip } else { String name = m.group(1); if (full.containsKey(name)) charval = full.get(name); } if (charval != -1 || charval > 0xFFFF) { // out of range String c = Character.toString((char) charval

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> 0x0017C}, {"zeetrf", 0x02128}, {"zeta", 0x003B6}, {"zfr", 0x1D537}, {"zhcy", 0x00436}, {"zigrarr", 0x021DD}, {"zopf", 0x1D56B}, {"zscr", 0x1D4CF}, {"zwj", 0x0200D}, {"zwnj", 0x0200C} }; static { full = new HashMap<String, Character>(fullArray.length); xhtmlByVal = new HashMap<Character, String>(xhtmlArray.length); baseByVal = new 
HashMap<Character, String>(baseArray.length); fullByVal = new HashMap<Character, String>(fullArray.length); for (Object[] entity : xhtmlArray) { Character c = Character.valueOf((char) ((Integer) entity[1]).intValue()); xhtmlByVal.put(c, ((String) entity[0])); } for (Object[] entity : baseArray) { Character c = Character.valueOf((char) ((Integer) entity[1]).intValue()); baseByVal.put(c, ((String) entity[0])); } for (Object[] entity : fullArray) { Character c = Character.valueOf((char) ((Integer) entity[1]).intValue()); full.put((String) entity[0], c); fullByVal.put(c, ((String) entity[0])); } } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.parser; import org.jsoup.helper.Validate; import java.util.HashMap; import java.util.Map; /** * HTML Tag capabilities. 
* * @author Jonathan Hedley, jonathan@hedley.net */ public class Tag { private static final Map<String, Tag> tags = new HashMap<String, Tag>(); // map of known tags private String tagName; private boolean isBlock = true; // block or inline private boolean formatAsBlock = true; // should be formatted as a block private boolean canContainBlock = true; // Can this tag hold block level tags? private boolean canContainInline = true; // only pcdata if not private boolean empty = false; // can hold nothing; e.g. img private boolean selfClosing = false; // can self close (<foo />). used for unknown tags that self close, without forcing them as empty. private boolean preserveWhitespace = false; // for pre, textarea, script etc private Tag(String tagName) { this.tagName = tagName.toLowerCase(); } /** * Get this tag's name. * * @return the tag's name */ public String getName() { return tagName; } /** * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything. * <p/> * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals(). * * @param tagName Name of tag, e.g. "p". Case insensitive. * @return The tag, either defined or new generic. */ public static Tag valueOf(String tagName) { Validate.notNull(tagName); tagName = tagName.trim().toLowerCase(); Validate.notEmpty(tagName); synchronized (tags) { Tag tag = tags.get(tagName); if (tag == null) { // not defined: create default; go anywhere, do anything! (incl be inside a <p>) tag = new Tag(tagName); tag.isBlock = false; tag.canContainBlock = true; } return tag; } } /** * Gets if this is a block tag. * * @return if block tag */ public boolean isBlock() { return isBlock; } /** * Gets

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> if this tag should be formatted as a block (or as inline) * * @return if should be formatted as block or inline */ public boolean formatAsBlock() { return formatAsBlock; } /** * Gets if this tag can contain block tags. 
* * @return if tag can contain block tags */ public boolean canContainBlock() { return canContainBlock; } /** * Gets if this tag is an inline tag. * * @return if this tag is an inline tag. */ public boolean isInline() { return !isBlock; } /** * Gets if this tag is a data only tag. * * @return if this tag is a data only tag */ public boolean isData() { return !canContainInline && !isEmpty(); } /** * Get if this is an empty tag * * @return if this is an emtpy tag */ public boolean isEmpty() { return empty; } /** * Get if this tag is self closing. * * @return if this tag should be output as self closing. */ public boolean isSelfClosing() { return empty || selfClosing; } /** * Get if this is a pre-defined tag, or was auto created on parsing. * * @return if a known tag */ public boolean isKnownTag() { return tags.containsKey(tagName); } /** * Check if this tagname is a known tag. * * @param tagName name of tag * @return if known HTML tag */ public static boolean isKnownTag(String tagName) { return tags.containsKey(tagName); } /** * Get if this tag should preserve whitespace within child text nodes. * * @return if preserve whitepace */ public boolean preserveWhitespace() { return preserveWhitespace; } Tag setSelfClosing() { selfClosing = true; return this; } @Override public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof Tag)) return false; Tag tag = (Tag) o; if (canContainBlock != tag.canContainBlock) return false; if (canContainInline != tag.canContainInline) return false; if (empty != tag.empty) return false; if (formatAsBlock != tag.formatAs

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>Block) return false; if (isBlock != tag.isBlock) return false; if (preserveWhitespace != tag.preserveWhitespace) return false; if (selfClosing != tag.selfClosing) return false; if (!tagName.equals(tag.tagName)) return false; return true; } @Override public int hashCode() { int result = tagName.hashCode(); result = 31 * result + 
(isBlock ? 1 : 0); result = 31 * result + (formatAsBlock ? 1 : 0); result = 31 * result + (canContainBlock ? 1 : 0); result = 31 * result + (canContainInline ? 1 : 0); result = 31 * result + (empty ? 1 : 0); result = 31 * result + (selfClosing ? 1 : 0); result = 31 * result + (preserveWhitespace ? 1 : 0); return result; } public String toString() { return tagName; } // internal static initialisers: // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other sources private static final String[] blockTags = { "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame", "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins", "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th", "td", "video", "audio", "canvas", "details", "menu", "plaintext" }; private static final String[] inlineTags = { "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>", "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "a", "img", "br", "wbr", "map", "q", "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup", "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", 
"summary", "command", "device" }; private static final String[] emptyTags = { "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", "device" }; private static final String[] formatAsInlineTags = { "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td" }; private static final String[] preserveWhitespaceTags = {"pre", "plaintext", "title"}; static { // creates for (String tagName : blockTags) { Tag tag = new Tag(tagName); register(tag); } for (String tagName : inlineTags) { Tag tag = new Tag(tagName); tag.isBlock = false; tag.canContainBlock = false; tag.formatAsBlock = false; register(tag); } // mods: for (String tagName : emptyTags) { Tag tag = tags.get(tagName); Validate.notNull(tag); tag.canContainBlock = false; tag.canContainInline = false; tag.empty = true; } for (String tagName : formatAsInlineTags) { Tag tag = tags.get(tagName); Validate.notNull(tag); tag.formatAsBlock = false; } for (String tagName : preserveWhitespaceTags) { Tag tag = tags.get(tagName); Validate.notNull(tag); tag.preserveWhitespace = true; } } private static Tag register(Tag tag) { synchronized (tags) { tags.put(tag.tagName, tag); } return tag;

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.nodes; import org.jsoup.helper.StringUtil; import org.jsoup.helper.Validate; import org.jsoup.parser.Parser; import org.jsoup.parser.Tag; import org.jsoup.select.Collector; import org.jsoup.select.Elements; import org.jsoup.select.Evaluator; import org.jsoup.select.Selector; import java.util.*; import 
java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; /** * A HTML element consists of a tag name, attributes, and child nodes (including text nodes and * other elements). * * From an Element, you can extract data, traverse the node graph, and manipulate the HTML. * * @author Jonathan Hedley, jonathan@hedley.net */ public class Element extends Node { private Tag tag; private Set<String> classNames; /** * Create a new, standalone Element. (Standalone in that is has no parent.) * * @param tag tag of this element * @param baseUri the base URI * @param attributes initial attributes * @see #appendChild(Node) * @see #appendElement(String) */ public Element(Tag tag, String baseUri, Attributes attributes) { super(baseUri, attributes); Validate.notNull(tag); this.tag = tag; } /** * Create a new Element from a tag and a base URI. * * @param tag element tag * @param baseUri the base URI of this element. It is acceptable for the base URI to be an empty * string, but not null. * @see Tag#valueOf(String) */ public Element(Tag tag, String baseUri) { this(tag, baseUri, new Attributes()); } @Override public String nodeName() { return tag.getName(); } /** * Get the name of the tag for this element. E.g. {@code div} * * @return the tag name */ public String tagName() { return tag.getName(); } /** * Change the tag of this element. For example, convert a {@code <span>} to a {@code <div>} with * {@code el.tagName("div");}. * * @param tagName new tag

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> name for this element * @return this element, for chaining */ public Element tagName(String tagName) { Validate.notEmpty(tagName, "Tag name must not be empty."); tag = Tag.valueOf(tagName); return this; } /** * Get the Tag for this element. 
* * @return the tag object */ public Tag tag() { return tag; } /** * Test if this element is a block-level element. (E.g. {@code <div> == true} or an inline element * {@code <p> == false}). * * @return true if block, false if not (and thus inline) */ public boolean isBlock() { return tag.isBlock(); } /** * Get the {@code id} attribute of this element. * * @return The id attribute, if present, or an empty string if not. */ public String id() { String id = attr("id"); return id == null ? "" : id; } /** * Set an attribute value on this element. If this element already has an attribute with the * key, its value is updated; otherwise, a new attribute is added. * * @return this element */ public Element attr(String attributeKey, String attributeValue) { super.attr(attributeKey, attributeValue); return this; } /** * Get this element's HTML5 custom data attributes. Each attribute in the element that has a key * starting with "data-" is included the dataset. * <p> * E.g., the element {@code <div data-package="jsoup" data-language="Java" class="group">...} has the dataset * {@code package=jsoup, language=java}. * <p> * This map is a filtered view of the element's attribute map. Changes to one map (add, remove, update) are reflected * in the other map. * <p> * You can find elements that have data attributes using the {@code [^data-]} attribute key prefix selector. * @return a map of {@code key=value} custom data attributes. */ public Map<String, String> dataset() { return attributes.dataset(); } @Override public final Element parent() { return (Element) parentNode; } /** * Get this element's parent and

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> (loosely) * </ul> * <p/> * See the query syntax documentation in {@link org.jsoup.select.Selector}. 
* * @param query a {@link Selector} query * @return elements that match the query (empty if none match) * @see org.jsoup.select.Selector */ public Elements select(String query) { return Selector.select(query, this); } /** * Add a node child node to this element. * * @param child node to add. Must not already have a parent. * @return this element, so that you can add more child nodes or elements. */ public Element appendChild(Node child) { Validate.notNull(child); addChildren(child); return this; } /** * Add a node to the start of this element's children. * * @param child node to add. Must not already have a parent. * @return this element, so that you can add more child nodes or elements. */ public Element prependChild(Node child) { Validate.notNull(child); addChildren(0, child); return this; } /** * Create a new element by tag name, and add it as the last child. * * @param tagName the name of the tag (e.g. {@code div}). * @return the new element, to allow you to add content to it, e.g.: * {@code parent.appendElement("h1").attr("id", "header").text("Welcome");} */ public Element appendElement(String tagName) { Element child = new Element(Tag.valueOf(tagName), baseUri()); appendChild(child); return child; } /** * Create a new element by tag name, and add it as the first child. * * @param tagName the name of the tag (e.g. {@code div}). * @return the new element, to allow you to add content to it, e.g.: * {@code parent.prependElement("h1").attr("id", "header").text("Welcome");} */ public Element prependElement(String tagName) { Element child = new Element(Tag.valueOf(tagName), baseUri()); prependChild(child); return child; } /** * Create and append a new TextNode to this element. *

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> * @param text the unencoded text to add * @return this element */ public Element appendText(String text) { TextNode node = new TextNode(text, baseUri()); appendChild(node); return this; } /** * Create and prepend a new TextNode to this element. 
* * @param text the unencoded text to add * @return this element */ public Element prependText(String text) { TextNode node = new TextNode(text, baseUri()); prependChild(node); return this; } /** * Add inner HTML to this element. The supplied HTML will be parsed, and each node appended to the end of the children. * @param html HTML to add inside this element, after the existing HTML * @return this element * @see #html(String) */ public Element append(String html) { Validate.notNull(html); List<Node> nodes = Parser.parseFragment(html, this, baseUri()); addChildren(nodes.toArray(new Node[nodes.size()])); return this; } /** * Add inner HTML into this element. The supplied HTML will be parsed, and each node prepended to the start of the element's children. * @param html HTML to add inside this element, before the existing HTML * @return this element * @see #html(String) */ public Element prepend(String html) { Validate.notNull(html); List<Node> nodes = Parser.parseFragment(html, this, baseUri()); addChildren(0, nodes.toArray(new Node[nodes.size()])); return this; } /** * Insert the specified HTML into the DOM before this element (i.e. as a preceeding sibling). * * @param html HTML to add before this element * @return this element, for chaining * @see #after(String) */ @Override public Element before(String html) { return (Element) super.before(html); } /** * Insert the specified node into the DOM before this node (i.e. as a preceeding sibling). * @param node to add before this element * @return this Element, for chaining * @see #after(Node) */ @Override public Element before(Node node) { return (Element) super.before(node); }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> /** * Insert the specified HTML into the DOM after this element (i.e. as a following sibling). 
* * @param html HTML to add after this element * @return this element, for chaining * @see #before(String) */ @Override public Element after(String html) { return (Element) super.after(html); } /** * Insert the specified node into the DOM after this node (i.e. as a following sibling). * @param node to add after this element * @return this element, for chaining * @see #before(Node) */ @Override public Element after(Node node) { return (Element) super.after(node); } /** * Remove all of the element's child nodes. Any attributes are left as-is. * @return this element */ public Element empty() { childNodes.clear(); return this; } /** * Wrap the supplied HTML around this element. * * @param html HTML to wrap around this element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep. * @return this element, for chaining. */ @Override public Element wrap(String html) { return (Element) super.wrap(html); } /** * Get sibling elements. * @return sibling elements */ public Elements siblingElements() { return parent().children(); } /** * Gets the next sibling element of this element. E.g., if a {@code div} contains two {@code p}s, * the {@code nextElementSibling} of the first {@code p} is the second {@code p}. * <p/> * This is similar to {@link #nextSibling()}, but specifically finds only Elements * @return the next element, or null if there is no next element * @see #previousElementSibling() */ public Element nextElementSibling() { List<Element> siblings = parent().children(); Integer index = indexInList(this, siblings); Validate.notNull(index); if (siblings.size() > index+1) return siblings.get(index+1); else return null; } /** * Gets the previous element sibling of this element. * @return the previous element, or null if there is no previous element * @see #nextElement

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>Sibling() */ public Element previousElementSibling() { List<Element> siblings = parent().children(); Integer index = indexInList(this, siblings); Validate.notNull(index); if (index > 0) return siblings.get(index-1); else return null; } /** * Gets the first element sibling of this element. 
* @return the first sibling that is an element (aka the parent's first element child) */ public Element firstElementSibling() { // todo: should firstSibling() exclude this? List<Element> siblings = parent().children(); return siblings.size() > 1 ? siblings.get(0) : null; } /** * Get the list index of this element in its element sibling list. I.e. if this is the first element * sibling, returns 0. * @return position in element sibling list */ public Integer elementSiblingIndex() { if (parent() == null) return 0; return indexInList(this, parent().children()); } /** * Gets the last element sibling of this element * @return the last sibling that is an element (aka the parent's last element child) */ public Element lastElementSibling() { List<Element> siblings = parent().children(); return siblings.size() > 1 ? siblings.get(siblings.size() - 1) : null; } private static <E extends Element> Integer indexInList(Element search, List<E> elements) { Validate.notNull(search); Validate.notNull(elements); for (int i = 0; i < elements.size(); i++) { E element = elements.get(i); if (element.equals(search)) return i; } return null; } // DOM type methods /** * Finds elements, including and recursively under this element, with the specified tag name. * @param tagName The tag name to search for (case insensitively). * @return a matching unmodifiable list of elements. Will be empty if this element and none of its children match. */ public Elements getElementsByTag(String tagName) { Validate.notEmpty(tagName); tagName = tagName.toLowerCase().trim(); return Collector.collect(new Evaluator.Tag(tagName), this); } /** * Find an element

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> by ID, including or under this element. * <p> * Note that this finds the first matching ID, starting with this element. If you search down from a different * starting point, it is possible to find a different element by ID. 
For unique element by ID within a Document, * use {@link Document#getElementById(String)} * @param id The ID to search for. * @return The first matching element by ID, starting with this element, or null if none found. */ public Element getElementById(String id) { Validate.notEmpty(id); Elements elements = Collector.collect(new Evaluator.Id(id), this); if (elements.size() > 0) return elements.get(0); else return null; } /** * Find elements that have this class, including or under this element. Case insensitive. * <p> * Elements can have multiple classes (e.g. {@code <div class="header round first">}. This method * checks each class, so you can find the above with {@code el.getElementsByClass("header");}. * * @param className the name of the class to search for. * @return elements with the supplied class name, empty if none * @see #hasClass(String) * @see #classNames() */ public Elements getElementsByClass(String className) { Validate.notEmpty(className); return Collector.collect(new Evaluator.Class(className), this); } /** * Find elements that have a named attribute set. Case insensitive. * * @param key name of the attribute, e.g. {@code href} * @return elements that have this attribute, empty if none */ public Elements getElementsByAttribute(String key) { Validate.notEmpty(key); key = key.trim().toLowerCase(); return Collector.collect(new Evaluator.Attribute(key), this); } /** * Find elements that have an attribute name starting with the supplied prefix. Use {@code data-} to find elements * that have HTML5 datasets. * @param keyPrefix name prefix of the attribute e.g. {@code data-} * @return elements that have attribute names that start with with the prefix, empty if none. */ public Elements getElementsByAttributeStarting(String keyPrefix) { Validate.notEmpty(keyPrefix); keyPrefix

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> = keyPrefix.trim().toLowerCase(); return Collector.collect(new Evaluator.AttributeStarting(keyPrefix), this); } /** * Find elements that have an attribute with the specific value. Case insensitive. 
* * @param key name of the attribute * @param value value of the attribute * @return elements that have this attribute with this value, empty if none */ public Elements getElementsByAttributeValue(String key, String value) { return Collector.collect(new Evaluator.AttributeWithValue(key, value), this); } /** * Find elements that either do not have this attribute, or have it with a different value. Case insensitive. * * @param key name of the attribute * @param value value of the attribute * @return elements that do not have a matching attribute */ public Elements getElementsByAttributeValueNot(String key, String value) { return Collector.collect(new Evaluator.AttributeWithValueNot(key, value), this); } /** * Find elements that have attributes that start with the value prefix. Case insensitive. * * @param key name of the attribute * @param valuePrefix start of attribute value * @return elements that have attributes that start with the value prefix */ public Elements getElementsByAttributeValueStarting(String key, String valuePrefix) { return Collector.collect(new Evaluator.AttributeWithValueStarting(key, valuePrefix), this); } /** * Find elements that have attributes that end with the value suffix. Case insensitive. * * @param key name of the attribute * @param valueSuffix end of the attribute value * @return elements that have attributes that end with the value suffix */ public Elements getElementsByAttributeValueEnding(String key, String valueSuffix) { return Collector.collect(new Evaluator.AttributeWithValueEnding(key, valueSuffix), this); } /** * Find elements that have attributes whose value contains the match string. Case insensitive. 
* * @param key name of the attribute * @param match substring of value to search for * @return elements that have attributes containing this text */ public Elements getElementsByAttributeValueContaining(String key, String match) { return Collector.collect(new Evaluator.AttributeWithValueContaining(key, match), this); } /** * Find elements that have attributes whose values match the supplied regular expression. * @param key name of the

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> attribute * @param pattern compiled regular expression to match against attribute values * @return elements that have attributes matching this regular expression */ public Elements getElementsByAttributeValueMatching(String key, Pattern pattern) { return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), 
this); } /** * Find elements that have attributes whose values match the supplied regular expression. * @param key name of the attribute * @param regex regular expression to match agaisnt attribute values. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex options. * @return elements that have attributes matching this regular expression */ public Elements getElementsByAttributeValueMatching(String key, String regex) { Pattern pattern; try { pattern = Pattern.compile(regex); } catch (PatternSyntaxException e) { throw new IllegalArgumentException("Pattern syntax error: " + regex, e); } return getElementsByAttributeValueMatching(key, pattern); } /** * Find elements whose sibling index is less than the supplied index. * @param index 0-based index * @return elements less than index */ public Elements getElementsByIndexLessThan(int index) { return Collector.collect(new Evaluator.IndexLessThan(index), this); } /** * Find elements whose sibling index is greater than the supplied index. * @param index 0-based index * @return elements greater than index */ public Elements getElementsByIndexGreaterThan(int index) { return Collector.collect(new Evaluator.IndexGreaterThan(index), this); } /** * Find elements whose sibling index is equal to the supplied index. * @param index 0-based index * @return elements equal to index */ public Elements getElementsByIndexEquals(int index) { return Collector.collect(new Evaluator.IndexEquals(index), this); } /** * Find elements that contain the specified string. The search is case insensitive. The text may appear directly * in the element, or in any of its descendants. * @param searchText to look for in the element's text * @return elements that contain the string, case insensitive. * @see Element#text() */ public Elements getElementsContainingText(String search

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>Text) { return Collector.collect(new Evaluator.ContainsText(searchText), this); } /** * Find elements that directly contain the specified string. The search is case insensitive. The text must appear directly * in the element, not in any of its descendants. 
* @param searchText to look for in the element's own text * @return elements that contain the string, case insensitive. * @see Element#ownText() */ public Elements getElementsContainingOwnText(String searchText) { return Collector.collect(new Evaluator.ContainsOwnText(searchText), this); } /** * Find elements whose text matches the supplied regular expression. * @param pattern regular expression to match text against * @return elements matching the supplied regular expression. * @see Element#text() */ public Elements getElementsMatchingText(Pattern pattern) { return Collector.collect(new Evaluator.Matches(pattern), this); } /** * Find elements whose text matches the supplied regular expression. * @param regex regular expression to match text against. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex options. * @return elements matching the supplied regular expression. * @see Element#text() */ public Elements getElementsMatchingText(String regex) { Pattern pattern; try { pattern = Pattern.compile(regex); } catch (PatternSyntaxException e) { throw new IllegalArgumentException("Pattern syntax error: " + regex, e); } return getElementsMatchingText(pattern); } /** * Find elements whose own text matches the supplied regular expression. * @param pattern regular expression to match text against * @return elements matching the supplied regular expression. * @see Element#ownText() */ public Elements getElementsMatchingOwnText(Pattern pattern) { return Collector.collect(new Evaluator.MatchesOwn(pattern), this); } /** * Find elements whose text matches the supplied regular expression. * @param regex regular expression to match text against. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> options. * @return elements matching the supplied regular expression. 
* @see Element#ownText() */ public Elements getElementsMatchingOwnText(String regex) { Pattern pattern; try { pattern = Pattern.compile(regex); } catch (PatternSyntaxException e) { throw new IllegalArgumentException("Pattern syntax error: " + regex, e); } return getElementsMatchingOwnText(pattern); } /** * Find all elements under this element (including self, and children of children). * * @return all elements */ public Elements getAllElements() { return Collector.collect(new Evaluator.AllElements(), this); } /** * Gets the combined text of this element and all its children. * <p> * For example, given HTML {@code <p>Hello <b>there</b> now!</p>}, {@code p.text()} returns {@code "Hello there now!"} * * @return unencoded text, or empty string if none. * @see #ownText() */ public String text() { StringBuilder sb = new StringBuilder(); text(sb); return sb.toString().trim(); } private void text(StringBuilder accum) { appendWhitespaceIfBr(this, accum); for (Node child : childNodes) { if (child instanceof TextNode) { TextNode textNode = (TextNode) child; appendNormalisedText(accum, textNode); } else if (child instanceof Element) { Element element = (Element) child; if (accum.length() > 0 && element.isBlock() && !TextNode.lastCharIsWhitespace(accum)) accum.append(" "); element.text(accum); } } } /** * Gets the text owned by this element only; does not get the combined text of all children. * <p> * For example, given HTML {@code <p>Hello <b>there</b> now!</p>}, {@code p.ownText()} returns {@code "Hello now!"}, * whereas {@code p.text()} returns {@code "Hello there now!"}. * Note that the text within the {@code b} element is not returned, as it is not a direct child of the {@code p} element. * * @return unencoded text, or empty string if none. * @see #text() */

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> public String ownText() { StringBuilder sb = new StringBuilder(); ownText(sb); return sb.toString().trim(); } private void ownText(StringBuilder accum) { for (Node child : childNodes) { if (child instanceof TextNode) { TextNode textNode = (TextNode) child; appendNormalisedText(accum, textNode); } else if (child instanceof 
Element) { appendWhitespaceIfBr((Element) child, accum); } } } private void appendNormalisedText(StringBuilder accum, TextNode textNode) { String text = textNode.getWholeText(); if (!preserveWhitespace()) { text = TextNode.normaliseWhitespace(text); if (TextNode.lastCharIsWhitespace(accum)) text = TextNode.stripLeadingWhitespace(text); } accum.append(text); } private static void appendWhitespaceIfBr(Element element, StringBuilder accum) { if (element.tag.getName().equals("br") && !TextNode.lastCharIsWhitespace(accum)) accum.append(" "); } boolean preserveWhitespace() { return tag.preserveWhitespace() || parent() != null && parent().preserveWhitespace(); } /** * Set the text of this element. Any existing contents (text or elements) will be cleared * @param text unencoded text * @return this element */ public Element text(String text) { Validate.notNull(text); empty(); TextNode textNode = new TextNode(text, baseUri); appendChild(textNode); return this; } /** Test if this element has any text content (that is not just whitespace). @return true if element has non-blank text content. */ public boolean hasText() { for (Node child: childNodes) { if (child instanceof TextNode) { TextNode textNode = (TextNode) child; if (!textNode.isBlank()) return true; } else if (child instanceof Element) { Element el = (Element) child; if (el.hasText()) return true; } } return false; } /** * Get the combined data of this element. Data is e.g. the inside of a {@code script} tag. * @return the data, or empty string if none */ public String data() { StringBuilder sb = new StringBuilder();

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> for (Node childNode : childNodes) { if (childNode instanceof DataNode) { DataNode data = (DataNode) childNode; sb.append(data.getWholeData()); } else if (childNode instanceof Element) { Element element = (Element) childNode; String elementData = element.data(); sb.append(elementData); } } return sb.toString(); } /** * Gets the 
literal value of this element's "class" attribute, which may include multiple class names, space * separated. (E.g. on <code>&lt;div class="header gray"></code> returns, "<code>header gray</code>") * @return The literal class attribute, or <b>empty string</b> if no class attribute set. */ public String className() { return attr("class"); } /** * Get all of the element's class names. E.g. on element {@code <div class="header gray"}>}, * returns a set of two elements {@code "header", "gray"}. Note that modifications to this set are not pushed to * the backing {@code class} attribute; use the {@link #classNames(java.util.Set)} method to persist them. * @return set of classnames, empty if no class attribute */ public Set<String> classNames() { if (classNames == null) { String[] names = className().split("\\s+"); classNames = new LinkedHashSet<String>(Arrays.asList(names)); } return classNames; } /** Set the element's {@code class} attribute to the supplied class names. @param classNames set of classes @return this element, for chaining */ public Element classNames(Set<String> classNames) { Validate.notNull(classNames); attributes.put("class", StringUtil.join(classNames, " ")); return this; } /** * Tests if this element has a class. Case insensitive. * @param className name of class to check for * @return true if it does, false if not */ public boolean hasClass(String className) { Set<String> classNames = classNames(); for (String name : classNames) { if (className.equalsIgnoreCase(name)) return true; } return false; } /** Add a class name to this element's {@code class} attribute. @

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>param className class name to add @return this element */ public Element addClass(String className) { Validate.notNull(className); Set<String> classes = classNames(); classes.add(className); classNames(classes); return this; } /** Remove a class name from this element's {@code class} attribute. 
@param className class name to remove @return this element */ public Element removeClass(String className) { Validate.notNull(className); Set<String> classes = classNames(); classes.remove(className); classNames(classes); return this; } /** Toggle a class name on this element's {@code class} attribute: if present, remove it; otherwise add it. @param className class name to toggle @return this element */ public Element toggleClass(String className) { Validate.notNull(className); Set<String> classes = classNames(); if (classes.contains(className)) classes.remove(className); else classes.add(className); classNames(classes); return this; } /** * Get the value of a form element (input, textarea, etc). * @return the value of the form element, or empty string if not set. */ public String val() { if (tagName().equals("textarea")) return text(); else return attr("value"); } /** * Set the value of a form element (input, textarea, etc). * @param value value to set * @return this element (for chaining) */ public Element val(String value) { if (tagName().equals("textarea")) text(value); else attr("value", value); return this; } void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { if (accum.length() > 0 && out.prettyPrint() && (tag.formatAsBlock() || (parent() != null && parent().tag().formatAsBlock()))) indent(accum, depth, out); accum .append("<") .append(tagName()); attributes.html(accum, out); if (childNodes.isEmpty() && tag.isSelfClosing()) accum.append(" />"); else accum.append(">"); } void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) { if (!(childNodes.isEmpty() && tag.isSelfClosing

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>())) { if (out.prettyPrint() && !childNodes.isEmpty() && tag.formatAsBlock()) indent(accum, depth, out); accum.append("</").append(tagName()).append(">"); } } /** * Retrieves the element's inner HTML. E.g. on a {@code <div>} with one empty {@code <p>}, would return * {@code <p></p>}. 
(Whereas {@link #outerHtml()} would return {@code <div><p></p></div>}.) * * @return String of HTML. * @see #outerHtml() */ public String html() { StringBuilder accum = new StringBuilder(); html(accum); return accum.toString().trim(); } private void html(StringBuilder accum) { for (Node node : childNodes) node.outerHtml(accum); } /** * Set this element's inner HTML. Clears the existing HTML first. * @param html HTML to parse and set into this element * @return this element * @see #append(String) */ public Element html(String html) { empty(); append(html); return this; } public String toString() { return outerHtml(); } @Override public boolean equals(Object o) { return this == o; } @Override public int hashCode() { // todo: fixup, not very useful int result = super.hashCode(); result = 31 * result + (tag != null ? tag.hashCode() : 0); return result; } @Override public Element clone() { Element clone = (Element) super.clone(); clone.classNames(); // creates linked set of class names from class attribute return clone; } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.parser; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import java.util.List; /** * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the more convenient parse methods * in {@link org.jsoup.Jsoup}. 
*/ public class Parser { /** * Parse HTML into a Document. * * @param html HTML to parse * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. * * @return parsed Document */ public static Document parse(String html, String baseUri) { TreeBuilder treeBuilder = new TreeBuilder(); return treeBuilder.parse(html, baseUri); } /** * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. * * @param fragmentHtml the fragment of HTML to parse * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This * provides stack context (for implicit element creation). * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. * * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modifed. */ public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) { TreeBuilder treeBuilder = new TreeBuilder(); return treeBuilder.parseFragment(fragmentHtml, context, baseUri); } /** * Parse a fragment of HTML into the {@code body} of a Document. * * @param bodyHtml fragment of HTML * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. * * @return Document, with empty head, and HTML parsed into body */ public static Document parseBodyFragment(String bodyHtml, String baseUri) { Document doc = Document.createShell(baseUri); Element body = doc.body(); List<Node> nodeList = parseFragment(bodyHtml, body, baseUri); Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> when re-parented for (Node node : nodes) { body.appendChild(node); } return doc; } /** * @param bodyHtml HTML to parse * @param baseUri baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 
* * @return parsed Document * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead. */ public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) { return parse(bodyHtml, baseUri); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.parser; import org.jsoup.helper.StringUtil; import org.jsoup.helper.Validate; import org.jsoup.nodes.*; import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedList; import java.util.List; /** * HTML Tree Builder; creates a DOM from Tokens. 
*/ class TreeBuilder { CharacterReader reader; Tokeniser tokeniser; private TreeBuilderState state; // the current state private TreeBuilderState originalState; // original / marked state private Document doc; // current doc we are building into private LinkedList<Element> stack; // the stack of open elements private String baseUri; // current base uri, for creating new elements private Token currentToken; // currentToken is used only for error tracking. private Element headElement; // the current head element private Element formElement; // the current form element private Element contextElement; // fragment parse context -- could be null even if fragment parsing private LinkedList<Element> formattingElements = new LinkedList<Element>(); // active (open) formatting elements private List<Token.Character> pendingTableCharacters = new ArrayList<Token.Character>(); // chars in table to be shifted out private boolean framesetOk = true; // if ok to go into frameset private boolean fosterInserts = false; // if next inserts should be fostered private boolean fragmentParsing = false; // if parsing a fragment of html private boolean trackErrors = false; private List<ParseError> errors = new ArrayList<ParseError>(); TreeBuilder() {} private void initialiseParse(String input, String baseUri) { doc = new Document(baseUri); reader = new CharacterReader(input); tokeniser = new Tokeniser(reader); stack = new LinkedList<Element>(); this.baseUri = baseUri; } Document parse(String input, String baseUri) { state = TreeBuilderState.Initial; initialiseParse(input, baseUri); runParser(); return doc; } List<Node> parseFragment(String inputFragment, Element context, String baseUri) { // context may be null initialiseParse(inputFragment, baseUri); contextElement = context; fragmentParsing = true; Element root = null; if (context != null) { if (context.ownerDocument() != null) // quirks setup: doc.

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>quirksMode(context.ownerDocument().quirksMode()); // initialise the tokeniser state: String contextTag = context.tagName(); if (StringUtil.in(contextTag, "title", "textarea")) tokeniser.transition(TokeniserState.Rcdata); else if (StringUtil.in(contextTag, "iframe", "noembed", "noframes", "style", "xmp")) 
tokeniser.transition(TokeniserState.Rawtext); else if (contextTag.equals("script")) tokeniser.transition(TokeniserState.ScriptData); else if (contextTag.equals(("noscript"))) tokeniser.transition(TokeniserState.Data); // if scripting enabled, rawtext else if (contextTag.equals("plaintext")) tokeniser.transition(TokeniserState.Data); else tokeniser.transition(TokeniserState.Data); // default root = new Element(Tag.valueOf("html"), baseUri); doc.appendChild(root); stack.push(root); resetInsertionMode(); // todo: setup form element to nearest form on context (up ancestor chain) } runParser(); if (context != null) return root.childNodes(); else return doc.childNodes(); } private void runParser() { while (true) { // todo: handle foreign content checks Token token = tokeniser.read(); process(token); if (token.type == Token.TokenType.EOF) break; } } boolean process(Token token) { currentToken = token; return this.state.process(token, this); } boolean process(Token token, TreeBuilderState state) { currentToken = token; return state.process(token, this); } void transition(TreeBuilderState state) { this.state = state; } TreeBuilderState state() { return state; } void markInsertionMode() { originalState = state; } TreeBuilderState originalState() { return originalState; } void framesetOk(boolean framesetOk) { this.framesetOk = framesetOk; } boolean framesetOk() { return framesetOk; } Element currentElement() { return stack.getLast(); } Document getDocument() { return doc; } String getBaseUri() { return baseUri; }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> void setBaseUri(Element base) { String href = base.absUrl("href"); if (href.length() != 0) { // ignore <base target> etc baseUri = href; doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base } } boolean isFragmentParsing() { return fragmentParsing; } void error(TreeBuilderState state) { if 
(trackErrors) errors.add(new ParseError("Unexpected token", state, currentToken, reader.pos())); } Element insert(Token.StartTag startTag) { // handle empty unknown tags // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate fake end tag. if (startTag.isSelfClosing() && !Tag.isKnownTag(startTag.name())) { Element el = insertEmpty(startTag); process(new Token.EndTag(el.tagName())); // ensure we get out of whatever state we are in return el; } Element el = new Element(Tag.valueOf(startTag.name()), baseUri, startTag.attributes); insert(el); return el; } Element insert(String startTagName) { Element el = new Element(Tag.valueOf(startTagName), baseUri); insert(el); return el; } void insert(Element el) { insertNode(el); stack.add(el); } Element insertEmpty(Token.StartTag startTag) { Tag tag = Tag.valueOf(startTag.name()); Element el = new Element(tag, baseUri, startTag.attributes); insertNode(el); if (startTag.isSelfClosing()) { tokeniser.acknowledgeSelfClosingFlag(); if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output tag.setSelfClosing(); } return el; } void insert(Token.Comment commentToken) { Comment comment = new Comment(commentToken.getData(), baseUri); insertNode(comment); } void insert(Token.Character characterToken) { Node node; // characters in script and style go in as datanodes, not text nodes if (StringUtil.in(currentElement().tagName(), "script", "style")) node = new DataNode(characterToken.getData(), baseUri

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>); else node = new TextNode(characterToken.getData(), baseUri); currentElement().appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. 
} private void insertNode(Node node) { // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc if (stack.size() == 0) doc.appendChild(node); else if (isFosterInserts()) insertInFosterParent(node); else currentElement().appendChild(node); } Element pop() { // todo - dev, remove validation check if (stack.peekLast().nodeName().equals("td") && !state.name().equals("InCell")) Validate.isFalse(true, "pop td not in cell"); if (stack.peekLast().nodeName().equals("html")) Validate.isFalse(true, "popping html!"); return stack.pollLast(); } void push(Element element) { stack.add(element); } LinkedList<Element> getStack() { return stack; } boolean onStack(Element el) { return isElementInQueue(stack, el); } private boolean isElementInQueue(LinkedList<Element> queue, Element element) { Iterator<Element> it = queue.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next == element) { return true; } } return false; } Element getFromStack(String elName) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next.nodeName().equals(elName)) { return next; } } return null; } boolean removeFromStack(Element el) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next == el) { it.remove(); return true; } } return false; } void popStackToClose(String elName) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next.nodeName().equals(elName)) { it.

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>remove(); break; } else { it.remove(); } } } void popStackToClose(String... 
elNames) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (StringUtil.in(next.nodeName(), elNames)) { it.remove(); break; } else { it.remove(); } } } void popStackToBefore(String elName) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next.nodeName().equals(elName)) { break; } else { it.remove(); } } } void clearStackToTableContext() { clearStackToContext("table"); } void clearStackToTableBodyContext() { clearStackToContext("tbody", "tfoot", "thead"); } void clearStackToTableRowContext() { clearStackToContext("tr"); } private void clearStackToContext(String... nodeNames) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (StringUtil.in(next.nodeName(), nodeNames) || next.nodeName().equals("html")) break; else it.remove(); } } Element aboveOnStack(Element el) { assert onStack(el); Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next == el) { return it.next(); } } return null; } void insertOnStackAfter(Element after, Element in) { int i = stack.lastIndexOf(after); Validate.isTrue(i != -1); stack.add(i+1, in); } void replaceOnStack(Element out, Element in) { replaceInQueue(stack, out, in); } private void replaceInQueue(LinkedList<Element> queue, Element out, Element in) { int i = queue.lastIndexOf(out); Validate.isTrue(i != -1); queue.remove(i); queue.add(i, in); } void resetInsertionMode() { boolean last = false; Iterator<Element

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>> it = stack.descendingIterator(); while (it.hasNext()) { Element node = it.next(); if (!it.hasNext()) { last = true; node = contextElement; } String name = node.nodeName(); if ("select".equals(name)) { transition(TreeBuilderState.InSelect); break; // frag } else if (("td".equals(name) || "td".equals(name) && !last)) { 
transition(TreeBuilderState.InCell); break; } else if ("tr".equals(name)) { transition(TreeBuilderState.InRow); break; } else if ("tbody".equals(name) || "thead".equals(name) || "tfoot".equals(name)) { transition(TreeBuilderState.InTableBody); break; } else if ("caption".equals(name)) { transition(TreeBuilderState.InCaption); break; } else if ("colgroup".equals(name)) { transition(TreeBuilderState.InColumnGroup); break; // frag } else if ("table".equals(name)) { transition(TreeBuilderState.InTable); break; } else if ("head".equals(name)) { transition(TreeBuilderState.InBody); break; // frag } else if ("body".equals(name)) { transition(TreeBuilderState.InBody); break; } else if ("frameset".equals(name)) { transition(TreeBuilderState.InFrameset); break; // frag } else if ("html".equals(name)) { transition(TreeBuilderState.BeforeHead); break; // frag } else if (last) { transition(TreeBuilderState.InBody); break; // frag } } } // todo: tidy up in specific scope methods private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { return inSpecificScope(new String[]{targetName}, baseTypes, extraTypes); } private boolean inSpecificScope(String[] targetNames, String[] baseTypes, String[] extraTypes) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element el = it.next(); String elName = el.nodeName(); if (StringUtil.in(elName, targetNames)) return true; if (StringUtil

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>.in(elName, baseTypes)) return false; if (extraTypes != null && StringUtil.in(elName, extraTypes)) return false; } Validate.fail("Should not be reachable"); return false; } boolean inScope(String[] targetNames) { return inSpecificScope(targetNames, new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", 
"object"}, null); } boolean inScope(String targetName) { return inScope(targetName, null); } boolean inScope(String targetName, String[] extras) { return inSpecificScope(targetName, new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}, extras); // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml // todo: in svg namespace: forignOjbect, desc, title } boolean inListItemScope(String targetName) { return inScope(targetName, new String[]{"ol", "ul"}); } boolean inButtonScope(String targetName) { return inScope(targetName, new String[]{"button"}); } boolean inTableScope(String targetName) { return inSpecificScope(targetName, new String[]{"html", "table"}, null); } boolean inSelectScope(String targetName) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element el = it.next(); String elName = el.nodeName(); if (elName.equals(targetName)) return true; if (!StringUtil.in(elName, "optgroup", "option")) // all elements except return false; } Validate.fail("Should not be reachable"); return false; } void setHeadElement(Element headElement) { this.headElement = headElement; } Element getHeadElement() { return headElement; } boolean isFosterInserts() { return fosterInserts; } void setFosterInserts(boolean fosterInserts) { this.fosterInserts = fosterInserts; } Element getFormElement() { return formElement; } void setFormElement(Element formElement) { this.formElement = form

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>Element; } void newPendingTableCharacters() { pendingTableCharacters = new ArrayList<Token.Character>(); } List<Token.Character> getPendingTableCharacters() { return pendingTableCharacters; } void setPendingTableCharacters(List<Token.Character> pendingTableCharacters) { this.pendingTableCharacters = pendingTableCharacters; } /** 
11.2.5.2 Closing elements that have implied end tags<p/> When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a dt element, an li element, an option element, an optgroup element, a p element, an rp element, or an rt element, the UA must pop the current node off the stack of open elements. @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the process, then the UA must perform the above steps as if that element was not in the above list. */ void generateImpliedEndTags(String excludeTag) { while ((excludeTag != null && !currentElement().nodeName().equals(excludeTag)) && StringUtil.in(currentElement().nodeName(), "dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) pop(); } void generateImpliedEndTags() { generateImpliedEndTags(null); } boolean isSpecial(Element el) { // todo: mathml's mi, mo, mn // todo: svg's foreigObject, desc, title String name = el.nodeName(); return StringUtil.in(name, "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd", "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "iframe", "img", "input", "isindex", "li", "link", "listing", "mar

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>(entry.nodeName()); // todo: avoid fostering here? // newEl.namespace(entry.namespace()); // todo: namespaces newEl.attributes().addAll(entry.attributes()); // 10. 
replace entry with new entry formattingElements.add(pos, newEl); formattingElements.remove(pos + 1); // 11 if (pos == size-1) // if not last entry in list, jump to 7 break; } } void clearFormattingElementsToLastMarker() { while (!formattingElements.isEmpty()) { Element el = formattingElements.peekLast(); formattingElements.removeLast(); if (el == null) break; } } void removeFromActiveFormattingElements(Element el) { Iterator<Element> it = formattingElements.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next == el) { it.remove(); break; } } } boolean isInActiveFormattingElements(Element el) { return isElementInQueue(formattingElements, el); } Element getActiveFormattingElement(String nodeName) { Iterator<Element> it = formattingElements.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next == null) // scope marker break; else if (next.nodeName().equals(nodeName)) return next; } return null; } void replaceActiveFormattingElement(Element out, Element in) { replaceInQueue(formattingElements, out, in); } void insertMarkerToFormattingElements() { formattingElements.add(null); } void insertInFosterParent(Node in) { Element fosterParent = null; Element lastTable = getFromStack("table"); boolean isLastTableParent = false; if (lastTable != null) { if (lastTable.parent() != null) { fosterParent = lastTable.parent(); isLastTableParent = true; } else fosterParent = aboveOnStack(lastTable); } else { // no table == frag fosterParent = stack.get(0); } if (isLastTableParent) lastTable.before(in); else fosterParent.appendChild(in); } @Override public String toString() { return "TreeBuilder{" +

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.nodes; import org.jsoup.helper.StringUtil; import org.jsoup.helper.Validate; /** A text node. @author Jonathan Hedley, jonathan@hedley.net */ public class TextNode extends Node { /* TextNode is a node, and so by default comes with attributes and children. 
The attributes are seldom used, but use memory, and the child nodes are never used. So we don't have them, and override accessors to attributes to create them as needed on the fly. */ private static final String TEXT_KEY = "text"; String text; /** Create a new TextNode representing the supplied (unencoded) text). @param text raw text @param baseUri base uri @see #createFromEncoded(String, String) */ public TextNode(String text, String baseUri) { this.baseUri = baseUri; this.text = text; } public String nodeName() { return "#text"; } /** * Get the text content of this text node. * @return Unencoded, normalised text. * @see TextNode#getWholeText() */ public String text() { return normaliseWhitespace(getWholeText()); } /** * Set the text content of this text node. * @param text unencoded text * @return this, for chaining */ public TextNode text(String text) { this.text = text; if (attributes != null) attributes.put(TEXT_KEY, text); return this; } /** Get the (unencoded) text of this text node, including any newlines and spaces present in the original. @return text */ public String getWholeText() { return attributes == null ? text : attributes.get(TEXT_KEY); } /** Test if this text node is blank -- that is, empty or only whitespace (including newlines). @return true if this document is empty or only whitespace, false if it contains any text content. */ public boolean isBlank() { return StringUtil.isBlank(getWholeText()); } /** * Split this text node into two nodes at the specified string offset. After splitting, this node will contain the * original text up to the offset, and will have a new text node sibling containing the text after the offset. *

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS> @param offset string offset point to split node at. * @return the newly created text node containing the text after the offset. 
*/ public TextNode splitText(int offset) { Validate.isTrue(offset >= 0, "Split offset must be not be negative"); Validate.isTrue(offset < text.length(), "Split offset must not be greater than current text length"); String head = getWholeText().substring(0, offset); String tail = getWholeText().substring(offset); text(head); TextNode tailNode = new TextNode(tail, this.baseUri()); if (parent() != null) parent().addChildren(siblingIndex()+1, tailNode); return tailNode; } void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { String html = Entities.escape(getWholeText(), out); if (out.prettyPrint() && parent() instanceof Element && !((Element) parent()).preserveWhitespace()) { html = normaliseWhitespace(html); } if (out.prettyPrint() && siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) indent(accum, depth, out); accum.append(html); } void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {} public String toString() { return outerHtml(); } /** * Create a new TextNode from HTML encoded (aka escaped) data. * @param encodedText Text containing encoded HTML (e.g. &amp;lt;) * @return TextNode containing unencoded data (e.g. &lt;) */ public static TextNode createFromEncoded(String encodedText, String baseUri) { String text = Entities.unescape(encodedText); return new TextNode(text, baseUri); } static String normaliseWhitespace(String text) { text = StringUtil.normaliseWhitespace(text); return text; } static String stripLeadingWhitespace(String text) { return text.replaceFirst("^\\s+", ""); } static boolean lastCharIsWhitespace(StringBuilder sb) { return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; } // attribute fiddling. create on first access. private void ensureAttributes() { if (attributes == null

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>) { attributes = new Attributes(); attributes.put(TEXT_KEY, text); } } @Override public String attr(String attributeKey) { ensureAttributes(); return super.attr(attributeKey); } @Override public Attributes attributes() { ensureAttributes(); return super.attributes(); } @Override public Node attr(String attributeKey, String 
attributeValue) { ensureAttributes(); return super.attr(attributeKey, attributeValue); } @Override public boolean hasAttr(String attributeKey) { ensureAttributes(); return super.hasAttr(attributeKey); } @Override public Node removeAttr(String attributeKey) { ensureAttributes(); return super.removeAttr(attributeKey); } @Override public String absUrl(String attributeKey) { ensureAttributes(); return super.absUrl(attributeKey); } }

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>package org.jsoup.helper; import java.util.Collection; import java.util.Iterator; /** * A minimal String utility class. Designed for interal jsoup use only. 
*/ public final class StringUtil { // memoised padding up to 10 private static final String[] padding = {"", " ", " ", " ", " ", " ", " ", " ", " ", " ", " "}; /** * Join a collection of strings by a seperator * @param strings collection of string objects * @param sep string to place between strings * @return joined string */ public static String join(Collection strings, String sep) { return join(strings.iterator(), sep); } /** * Join a collection of strings by a seperator * @param strings iterator of string objects * @param sep string to place between strings * @return joined string */ public static String join(Iterator strings, String sep) { if (!strings.hasNext()) return ""; String start = strings.next().toString(); if (!strings.hasNext()) // only one, avoid builder return start; StringBuilder sb = new StringBuilder(64).append(start); while (strings.hasNext()) { sb.append(sep); sb.append(strings.next()); } return sb.toString(); } /** * Returns space padding * @param width amount of padding desired * @return string of spaces * width */ public static String padding(int width) { if (width < 0) throw new IllegalArgumentException("width must be > 0"); if (width < padding.length) return padding[width]; char[] out = new char[width]; for (int i = 0; i < width; i++) out[i] = ' '; return String.valueOf(out); } /** * Tests if a string is blank: null, emtpy, or only whitespace (" ", \r\n, \t, etc) * @param string string to test * @return if string is blank */ public static boolean isBlank(String string) { if (string == null || string.length() == 0) return true; int l = string.length(); for (int i = 0; i < l; i++) { if (!Character.isWhitespace

Jsoup, 14

<FILEB>
<CHANGES>
String appropriateEndTagName() {
return lastStartTag.tagName;
}
<CHANGEE>
<FILEE>
<FILEB>
<CHANGES>
} else if (r.matchesLetter() &&!r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
<CHANGEE>
<CHANGES>
t.tagPending = new Token.EndTag(t.appropriateEndTagName());
t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(Data);
<CHANGEE>
<FILEE>
<FILEB> void createCommentPending() { commentPending = new Token.Comment(); } void emitCommentPending() { emit(commentPending); } void createDoctypePending() { doctypePending = new Token.Doctype(); } void emitDoctypePending() { emit(doctypePending); } void createTempBuffer() { dataBuffer = new StringBuilder(); } boolean isAppropriateEndTagToken() { return tagPending.tagName.equals(lastStartTag.tagName); } <CHANGES> <CHANGEE> boolean isTrackErrors() { return trackErrors; } void setTrackErrors(boolean trackErrors) { this.trackErrors = trackErrors; } void error(TokeniserState state) { if (trackErrors) errors.add(new ParseError("Unexpected character in input", reader.current(), state, reader.pos())); } void eofError(TokeniserState state) { if (trackErrors) <FILEE> <FILEB> t.transition(Data); break; case nullChar: // replacement t.tagPending.appendTagName(replacementStr); break; case eof: // should emit pending tag? t.eofError(this); t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); <CHANGES> <CHANGEE> // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than // consuming to EOF; break out here <CHANGES> <CHANGEE> } else { t.emit("<"); t.transition(Rcdata); } } }, RCDATAEndTagOpen { void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); t.tagPending.appendTagName(Character.toLowerCase(r.current())); t.dataBuffer.append(Character.toLowerCase(r.current())); <FILEE> <SCANS>(string.codePointAt(i))) return false; } return true; } /** * Tests if a string is numeric, i.e. 
contains only digit characters * @param string string to test * @return true if only digit chars, false if empty or null or contains non-digit chrs */ public static boolean isNumeric(String string) { if (string == null || string.length() == 0) return false; int l = string.length(); for (int i = 0; i < l; i++) { if (!Character.isDigit(string.codePointAt(i))) return false; } return true; } public static String normaliseWhitespace(String string) { StringBuilder sb = new StringBuilder(string.length()); boolean lastWasWhite = false; boolean modified = false; int l = string.length(); for (int i = 0; i < l; i++) { int c = string.codePointAt(i); if (Character.isWhitespace(c)) { if (lastWasWhite) { modified = true; continue; } if (c != ' ') modified = true; sb.append(' '); lastWasWhite = true; } else { sb.appendCodePoint(c); lastWasWhite = false; } } return modified ? sb.toString() : string; } public static boolean in(String needle, String... haystack) { for (String hay : haystack) { if (hay.equals(needle)) return true; } return false; } }